Skip to main content

hash_roll/
fastcdc.rs

1#![cfg(feature = "fastcdc")]
2
3//! FastCDC is a chunking algorithm using some features from [Gear](super::gear)
4//!
5//! Reference:
6//!  - https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf
7
8use crate::{Chunk, ToChunkIncr, ChunkIncr};
9use std::fmt;
10use std::num::Wrapping;
11
12// these masks are taken from the paper and could be adjusted/adjustable.
13const MASK_S: u64 = 0x0003590703530000;
14//const MASK_A: u64 = 0x0000d90303530000;
15const MASK_L: u64 = 0x0000d90003530000;
16
17/// An instance of the "FastCDC" algorithm
18///
19/// Default parameters:
20///  - Minimum chunk size: 2 KiB
21///  - Maximum chunk size: 64 KiB
22///  - Normal size: 8 KiB
23///  - internal 64-bit gear table: [`super::gear_table::GEAR_64`]
24///
25#[derive(Clone, Copy)]
26pub struct FastCdc<'a> {
27    gear: &'a [u64; 256],
28    min_size: u64,
29    max_size: u64,
30    normal_size: u64,
31}
32
33impl<'a> PartialEq for FastCdc<'a> {
34    fn eq(&self, other: &Self) -> bool {
35        self.min_size == other.min_size
36            && self.max_size == other.max_size
37            && self.normal_size == other.normal_size
38            && &self.gear[..] == &other.gear[..]
39    }
40}
41
42impl<'a> Eq for FastCdc<'a> {}
43
44impl<'a> Default for FastCdc<'a> {
45    fn default() -> Self {
46        FastCdc {
47            min_size: 2 * 1024,    // 2 KiB
48            max_size: 64 * 1024,   // 64 KiB
49            normal_size: 8 * 1024, // 8 KiB
50            gear: &super::gear_table::GEAR_64,
51        }
52    }
53}
54
55impl<'a> fmt::Debug for FastCdc<'a> {
56    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
57        f.debug_struct("FastCdc")
58            .field("gear", &"[...]")
59            .field("min_size", &self.min_size)
60            .field("max_size", &self.max_size)
61            .field("normal_size", &self.normal_size)
62            .finish()
63    }
64}
65
66impl<'a> Chunk for FastCdc<'a> {
67    type SearchState = FastCdcState;
68
69    fn to_search_state(&self) -> Self::SearchState {
70        Default::default()
71    }
72
73    fn find_chunk_edge(&self, state: &mut Self::SearchState, data: &[u8]) -> (Option<usize>, usize) {
74        match state.push(self, data) {
75            Some(i) => (Some(i + 1), i + 1),
76            None => (None, data.len()),
77        }
78    }
79}
80
81impl<'a> FastCdc<'a> {
82    /// Create a custom FastCDC instance
83    pub fn new(gear: &'a [u64; 256], min_size: u64, normal_size: u64, max_size: u64) -> Self {
84        Self {
85            gear,
86            min_size,
87            max_size,
88            normal_size,
89        }
90    }
91}
92
93impl<'a> ToChunkIncr for FastCdc<'a> {
94    type Incr = FastCdcIncr<'a>;
95
96    fn to_chunk_incr(&self) -> Self::Incr {
97        self.into()
98    }
99}
100
101impl<'a> From<&FastCdc<'a>> for FastCdcIncr<'a> {
102    fn from(params: &FastCdc<'a>) -> Self {
103        Self {
104            params: params.clone(),
105            state: Default::default(),
106        }
107    }
108}
109
110/// FastCdcIncr provides an incrimental interface to `FastCdc`
111///
112/// This impl does not buffer data passing through it (the FastCDC algorithm does not require
113/// look-back) making it very efficient.
114#[derive(Debug, Clone, PartialEq, Eq, Default)]
115pub struct FastCdcIncr<'a> {
116    params: FastCdc<'a>,
117    state: FastCdcState,
118}
119
120#[derive(Debug, Clone, PartialEq, Eq, Default)]
121pub struct FastCdcState {
122    /// Number of bytes we've "examined"
123    ///
124    /// varying state.
125    l: u64,
126
127    /// Current fingerprint
128    ///
129    /// varying state.
130    fp: Wrapping<u64>,
131}
132
133impl FastCdcState {
134    fn reset(&mut self) {
135        self.l = 0;
136        self.fp = Wrapping(0);
137    }
138
139    fn push(&mut self, params: &FastCdc<'_>, data: &[u8]) -> Option<usize> {
140        // global start/index
141        let mut gi = self.l;
142        // global end
143        let ge = data.len() as u64 + gi;
144
145        if ge <= params.min_size {
146            // No split, no processing of data, but we've "consumed" the bytes.
147            self.l = ge;
148            return None;
149        }
150
151        // skip elements prior to MIN_SIZE and track offset of new `data` in argument `data` for
152        // return value
153        let mut i = if gi <= params.min_size {
154            let skip = params.min_size - gi;
155            gi += skip;
156            skip
157        } else {
158            0
159        } as usize;
160
161        let mut fp = self.fp;
162
163        loop {
164            if i >= data.len() {
165                break;
166            }
167            if gi >= params.normal_size {
168                // go to next set of matches
169                break;
170            }
171
172            let v = data[i];
173            fp = (fp << 1) + Wrapping(params.gear[v as usize]);
174            if (fp.0 & MASK_S) == 0 {
175                self.reset();
176                return Some(i);
177            }
178
179            gi += 1;
180            i += 1;
181        }
182
183        loop {
184            if gi >= params.max_size {
185                // no match found, emit fixed match at MAX_SIZE
186                self.reset();
187                return Some(i);
188            }
189            if i >= data.len() {
190                break;
191            }
192
193            let v = data[i];
194            fp = (fp << 1) + Wrapping(params.gear[v as usize]);
195            if (fp.0 & MASK_L) == 0 {
196                self.reset();
197                return Some(i);
198            }
199
200            gi += 1;
201            i += 1;
202        }
203
204        // no match, but not at MAX_SIZE yet, so store context for next time.
205        self.fp = fp;
206        self.l = ge;
207
208        None
209    }
210}
211
212impl<'a> ChunkIncr for FastCdcIncr<'a> {
213    fn push(&mut self, src: &[u8]) -> Option<usize> {
214        self.state.push(&self.params, src)
215    }
216}