1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#![cfg(feature = "gzip")]

//! gzip (forks) rsyncable mode that uses a simple window accumulator
//!
//! **WARNING: not validated against gzip or rsyncrypto algorithms. Output may change to fix bugs**
//!
//! A widely distributed patch for gzip adds a `--rsyncable` flag which causes `gzip` to split it's
//! input on "stable" boundaries. This module impliments the algorithm used in that patch.
//!
//! The same algorithm is used by the `rsyncrypto` project.
//!
//!  - No maximum block size is provided.
//!  - No minimum block size is provided.
//!
//! PDF of block sizes: ???
//!
//! Note that the defacto-standard parameters allow a slightly more efficient check for a block
//! split (by replacing a modulus with a bitwise-and). This impl currently doesn't allow that
//! optimization even if you provide appropriate parameters (we need type-level integers for that).
//!
//! Parameters:
//!
//!  - window-len: The maximum number of bytes to be examined when deciding to split a block.
//!              set to 8192 by default in gzip-rsyncable & rsyncrypto)
//!  - modulus:    set to half of window-len (so, 4096) in gzip-rsyncable & rsyncrypto.
//!
//! In-block state:
//!  - window of window-len bytes (use of the iterator interface means we also track more bytes than
//!      this)
//!  - sum (u64)
//!
//! Between-block state:
//!
//! - none
//!
//! References:
//!
//! - http://rsyncrypto.lingnu.com/index.php/Algorithm
//! - https://www.samba.org/~tridge/phd_thesis.pdf
//!
//! S(n) = sum(c_i, var=i, top=n, bottom=n-8196)
//!
//! A(n) = S(n) / 8192
//!
//! H(n) = S(n) mod 4096
//!
//! Trigger splits when H(n) == 0

use crate::{Chunk, ChunkIncr, ToChunkIncr};
use std::collections::VecDeque;
use std::num::Wrapping;

/// Parameters for defining the gzip rsyncable algorithm
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct GzipRsyncable {
    /*
     * TODO: if we can avoid loading entire files into memory, this could be u64
     */
    window_len: usize,
    modulus: u64,
}

impl GzipRsyncable {
    pub fn with_window_and_modulus(window: usize, modulus: u64) -> GzipRsyncable {
        Self {
            window_len: window,
            modulus,
        }
    }
}

impl Default for GzipRsyncable {
    fn default() -> Self {
        Self::with_window_and_modulus(8192, 4096)
    }
}

impl Chunk for GzipRsyncable {
    type SearchState = GzipRsyncableSearchState;

    fn to_search_state(&self) -> Self::SearchState {
        Self::SearchState::default()
    }

    fn find_chunk_edge(
        &self,
        state: &mut Self::SearchState,
        data: &[u8],
    ) -> (Option<usize>, usize) {
        for i in state.offset..data.len() {
            let v = data[i];

            if state.state.add(data, self, i, v) {
                state.reset();
                return (Some(i + 1), i + 1);
            }
        }

        // keep k elements = discard all but k
        let discard_ct = data.len().saturating_sub(self.window_len);
        state.offset = data.len() - discard_ct;
        (None, discard_ct)
    }
}

impl From<&GzipRsyncable> for GzipRsyncableIncr {
    fn from(src: &GzipRsyncable) -> Self {
        src.clone().into()
    }
}

impl ToChunkIncr for GzipRsyncable {
    type Incr = GzipRsyncableIncr;
    fn to_chunk_incr(&self) -> Self::Incr {
        self.into()
    }
}

#[derive(Debug, Default, Clone)]
struct GzipRsyncableState {
    accum: Wrapping<u64>,
}

impl GzipRsyncableState {
    fn reset(&mut self) {
        self.accum.0 = 0;
    }
}

/// Intermediate state for [`GzipRsyncable::find_chunk_edge`]
///
/// Using this avoids re-computation of data when no edge is found
#[derive(Debug, Default, Clone)]
pub struct GzipRsyncableSearchState {
    offset: usize,
    state: GzipRsyncableState,
}

impl GzipRsyncableSearchState {
    fn reset(&mut self) {
        self.offset = 0;
        self.state.reset();
    }
}

/// Provides an incremental interface to [`GzipRsyncable`]
///
/// Performance Note: [`GzipRsyncable`] requires look-back. As a result, [`GzipRsyncableIncr`] internally
/// buffers data up to the window size. This additional copying may affect performance. If
/// possible for your use case, use the non-incremental interface.
///
/// See [`GzipRsyncable`] for details on the underlying algorithm
#[derive(Debug, Clone)]
pub struct GzipRsyncableIncr {
    params: GzipRsyncable,

    accum: Wrapping<u64>,
    // really poor efficiency
    window: VecDeque<u8>,
}

impl GzipRsyncableIncr {
    fn reset(&mut self) {
        self.window.clear();
        self.accum = Wrapping(0);
    }
}

impl From<GzipRsyncable> for GzipRsyncableIncr {
    fn from(params: GzipRsyncable) -> Self {
        let window = VecDeque::with_capacity(params.window_len);
        GzipRsyncableIncr {
            params,
            accum: Wrapping(0),
            window,
        }
    }
}

impl GzipRsyncableState {
    fn add(&mut self, data: &[u8], parent: &GzipRsyncable, i: usize, v: u8) -> bool {
        if i >= parent.window_len {
            self.accum -= Wrapping(data[i - parent.window_len] as u64);
        }
        self.accum += Wrapping(v as u64);
        (self.accum % Wrapping(parent.modulus)).0 == 0
    }
}

impl ChunkIncr for GzipRsyncableIncr {
    fn push(&mut self, data: &[u8]) -> Option<usize> {
        for (i, &v) in data.iter().enumerate() {
            if self.window.len() >= self.params.window_len {
                self.accum -= Wrapping(self.window.pop_front().unwrap() as u64);
            }

            self.accum += Wrapping(v as u64);
            self.window.push_back(v);

            if (self.accum % Wrapping(self.params.modulus)).0 == 0 {
                self.reset();
                return Some(i + 1);
            }
        }

        None
    }
}