Skip to main content

hash_roll/
gzip.rs

1#![cfg(feature = "gzip")]
2
3//! gzip (forks) rsyncable mode that uses a simple window accumulator
4//!
5//! **WARNING: not validated against gzip or rsyncrypto algorithms. Output may change to fix bugs**
6//!
7//! A widely distributed patch for gzip adds a `--rsyncable` flag which causes `gzip` to split it's
8//! input on "stable" boundaries. This module impliments the algorithm used in that patch.
9//!
10//! The same algorithm is used by the `rsyncrypto` project.
11//!
12//!  - No maximum block size is provided.
13//!  - No minimum block size is provided.
14//!
15//! PDF of block sizes: ???
16//!
17//! Note that the defacto-standard parameters allow a slightly more efficient check for a block
18//! split (by replacing a modulus with a bitwise-and). This impl currently doesn't allow that
19//! optimization even if you provide appropriate parameters (we need type-level integers for that).
20//!
21//! Parameters:
22//!
23//!  - window-len: The maximum number of bytes to be examined when deciding to split a block.
24//!              set to 8192 by default in gzip-rsyncable & rsyncrypto)
25//!  - modulus:    set to half of window-len (so, 4096) in gzip-rsyncable & rsyncrypto.
26//!
27//! In-block state:
28//!  - window of window-len bytes (use of the iterator interface means we also track more bytes than
29//!      this)
30//!  - sum (u64)
31//!
32//! Between-block state:
33//!
34//! - none
35//!
36//! References:
37//!
38//! - http://rsyncrypto.lingnu.com/index.php/Algorithm
39//! - https://www.samba.org/~tridge/phd_thesis.pdf
40//!
41//! S(n) = sum(c_i, var=i, top=n, bottom=n-8196)
42//!
43//! A(n) = S(n) / 8192
44//!
45//! H(n) = S(n) mod 4096
46//!
47//! Trigger splits when H(n) == 0
48
49use crate::{Chunk, ChunkIncr, ToChunkIncr};
50use std::collections::VecDeque;
51use std::num::Wrapping;
52
53/// Parameters for defining the gzip rsyncable algorithm
54#[derive(Clone, Debug, PartialEq, Eq)]
55pub struct GzipRsyncable {
56    /*
57     * TODO: if we can avoid loading entire files into memory, this could be u64
58     */
59    window_len: usize,
60    modulus: u64,
61}
62
63impl GzipRsyncable {
64    pub fn with_window_and_modulus(window: usize, modulus: u64) -> GzipRsyncable {
65        Self {
66            window_len: window,
67            modulus,
68        }
69    }
70}
71
72impl Default for GzipRsyncable {
73    fn default() -> Self {
74        Self::with_window_and_modulus(8192, 4096)
75    }
76}
77
78impl Chunk for GzipRsyncable {
79    type SearchState = GzipRsyncableSearchState;
80
81    fn to_search_state(&self) -> Self::SearchState {
82        Self::SearchState::default()
83    }
84
85    fn find_chunk_edge(
86        &self,
87        state: &mut Self::SearchState,
88        data: &[u8],
89    ) -> (Option<usize>, usize) {
90        for i in state.offset..data.len() {
91            let v = data[i];
92
93            if state.state.add(data, self, i, v) {
94                state.reset();
95                return (Some(i + 1), i + 1);
96            }
97        }
98
99        // keep k elements = discard all but k
100        let discard_ct = data.len().saturating_sub(self.window_len);
101        state.offset = data.len() - discard_ct;
102        (None, discard_ct)
103    }
104}
105
106impl From<&GzipRsyncable> for GzipRsyncableIncr {
107    fn from(src: &GzipRsyncable) -> Self {
108        src.clone().into()
109    }
110}
111
112impl ToChunkIncr for GzipRsyncable {
113    type Incr = GzipRsyncableIncr;
114    fn to_chunk_incr(&self) -> Self::Incr {
115        self.into()
116    }
117}
118
119#[derive(Debug, Default, Clone)]
120struct GzipRsyncableState {
121    accum: Wrapping<u64>,
122}
123
124impl GzipRsyncableState {
125    fn reset(&mut self) {
126        self.accum.0 = 0;
127    }
128}
129
130/// Intermediate state for [`GzipRsyncable::find_chunk_edge`]
131///
132/// Using this avoids re-computation of data when no edge is found
133#[derive(Debug, Default, Clone)]
134pub struct GzipRsyncableSearchState {
135    offset: usize,
136    state: GzipRsyncableState,
137}
138
139impl GzipRsyncableSearchState {
140    fn reset(&mut self) {
141        self.offset = 0;
142        self.state.reset();
143    }
144}
145
146/// Provides an incremental interface to [`GzipRsyncable`]
147///
148/// Performance Note: [`GzipRsyncable`] requires look-back. As a result, [`GzipRsyncableIncr`] internally
149/// buffers data up to the window size. This additional copying may affect performance. If
150/// possible for your use case, use the non-incremental interface.
151///
152/// See [`GzipRsyncable`] for details on the underlying algorithm
153#[derive(Debug, Clone)]
154pub struct GzipRsyncableIncr {
155    params: GzipRsyncable,
156
157    accum: Wrapping<u64>,
158    // really poor efficiency
159    window: VecDeque<u8>,
160}
161
162impl GzipRsyncableIncr {
163    fn reset(&mut self) {
164        self.window.clear();
165        self.accum = Wrapping(0);
166    }
167}
168
169impl From<GzipRsyncable> for GzipRsyncableIncr {
170    fn from(params: GzipRsyncable) -> Self {
171        let window = VecDeque::with_capacity(params.window_len);
172        GzipRsyncableIncr {
173            params,
174            accum: Wrapping(0),
175            window,
176        }
177    }
178}
179
180impl GzipRsyncableState {
181    fn add(&mut self, data: &[u8], parent: &GzipRsyncable, i: usize, v: u8) -> bool {
182        if i >= parent.window_len {
183            self.accum -= Wrapping(data[i - parent.window_len] as u64);
184        }
185        self.accum += Wrapping(v as u64);
186        (self.accum % Wrapping(parent.modulus)).0 == 0
187    }
188}
189
190impl ChunkIncr for GzipRsyncableIncr {
191    fn push(&mut self, data: &[u8]) -> Option<usize> {
192        for (i, &v) in data.iter().enumerate() {
193            if self.window.len() >= self.params.window_len {
194                self.accum -= Wrapping(self.window.pop_front().unwrap() as u64);
195            }
196
197            self.accum += Wrapping(v as u64);
198            self.window.push_back(v);
199
200            if (self.accum % Wrapping(self.params.modulus)).0 == 0 {
201                self.reset();
202                return Some(i + 1);
203            }
204        }
205
206        None
207    }
208}