Skip to main content

hash_roll/
mii.rs

1#![cfg(feature = "mii")]
2use crate::{ChunkIncr, ToChunkIncr};
3
4/// C. Zhang et al., "MII: A Novel Content Defined Chunking Algorithm for Finding Incremental Data
5/// in Data Synchronization," in IEEE Access, vol. 7, pp. 86932-86945, 2019, doi:
6/// 10.1109/ACCESS.2019.2926195.
7///
8/// https://ieeexplore.ieee.org/abstract/document/8752387
9#[derive(Debug, Clone)]
10pub struct Mii {
11    w: u64,
12}
13
14impl Mii {
15    /// Create a new splitter with parameter `w`
16    ///
17    /// `w` is the number of "increments" (positive changes in byte value) after which we split the
18    /// input
19    ///
20    // TODO: determine distribution and expected size of chunks
21    //
22    // 1: P(curr > prev) = 0    (prev set to 0xff)
23    // 2: P(curr > prev) = 0.5  (prev and curr assumed to be randomly distributed)
24    // 3: P(curr > prev) |  t2 = ???
25    //    P(curr > prev) | !t2 = ???
26    pub fn with_w(w: u64) -> Self {
27        Self { w }
28    }
29}
30
31impl Default for Mii {
32    /// The window of 5 is used in the paper for the generated graphs
33    ///
34    /// It is compared against Rabin with a window of 7 and AE/LMC/RAM with a window of 700
35    fn default() -> Self {
36        Mii::with_w(5)
37    }
38}
39
40impl crate::Chunk for Mii {
41    type SearchState = MiiSearchState;
42
43    fn to_search_state(&self) -> Self::SearchState {
44        Into::<MiiIncr>::into(self).into()
45    }
46
47    fn find_chunk_edge(
48        &self,
49        state: &mut Self::SearchState,
50        data: &[u8],
51    ) -> (Option<usize>, usize) {
52        match state.push(data) {
53            Some(v) => {
54                state.reset();
55                (Some(v), v)
56            }
57            None => (None, data.len()),
58        }
59    }
60}
61
62impl From<&Mii> for MiiIncr {
63    fn from(src: &Mii) -> Self {
64        src.clone().into()
65    }
66}
67
68impl ToChunkIncr for Mii {
69    type Incr = MiiIncr;
70
71    fn to_chunk_incr(&self) -> Self::Incr {
72        self.into()
73    }
74}
75
76#[derive(Debug)]
77pub struct MiiSearchState {
78    incr: MiiIncr,
79}
80
81impl MiiSearchState {
82    fn reset(&mut self) {
83        self.incr.reset();
84    }
85}
86
87impl From<MiiIncr> for MiiSearchState {
88    fn from(incr: MiiIncr) -> Self {
89        Self { incr }
90    }
91}
92
93impl MiiSearchState {
94    fn push(&mut self, data: &[u8]) -> Option<usize> {
95        self.incr.push(data)
96    }
97}
98
99#[derive(Debug)]
100pub struct MiiIncr {
101    /// After this many increments, split the file
102    w: u64,
103
104    /// previous examined byte, if any
105    prev: u8,
106
107    /// number of times a byte was greater than the previous value
108    increment: u64,
109}
110
111impl From<Mii> for MiiIncr {
112    fn from(p: Mii) -> Self {
113        MiiIncr {
114            w: p.w,
115            // we use 0xff to ensure that the first examined byte does not trigger an increment
116            prev: 0xff,
117            increment: 0,
118        }
119    }
120}
121
122impl ChunkIncr for MiiIncr {
123    fn push(&mut self, input: &[u8]) -> Option<usize> {
124        for (i, b) in input.iter().cloned().enumerate() {
125            if b > self.prev {
126                self.increment += 1;
127                if self.increment == self.w {
128                    // this is a split
129                    self.increment = 0;
130                    self.prev = 0;
131                    return Some(i + 1);
132                }
133            } else {
134                self.increment = 0;
135            }
136            self.prev = b;
137        }
138
139        None
140    }
141}
142
143impl MiiIncr {
144    fn reset(&mut self) {
145        self.prev = 0xff;
146        self.increment = 0;
147    }
148}