smdiff_encoder/
src_matcher.rs

1
2use crate::{hasher::*, hashmap::BasicHashTable,Ranger};
3
4struct InnerConfig{
5    l_step:usize,
6    src_len:usize,
7    trgt_len:usize,
8    src_win_size:usize,
9    max_end_pos:usize,
10}
11
12pub(crate) struct SrcMatcher{
13    pub(crate) l_step:usize,
14    pub(crate) fwd_hash: usize,
15    pub(crate) fwd_pos: usize,
16    pub(crate) max_fwd_hash_pos:usize,
17    table: BasicHashTable,
18    //Window calc state
19    src_len:usize,
20    trgt_len:usize,
21    half_win_size:usize,
22    max_end_pos:usize,
23    cur_window_end:usize,
24    pub(crate) next_hash_pos:usize,
25    max_match_pos:usize,
26}
27
28impl SrcMatcher{
29
30    ///Returns (src_pos, pre_match, post_match) post_match *includes* the hash_win_len.
31    pub fn find_best_src_match(&mut self,src:&[u8],trgt:&[u8])->Option<(usize,usize,usize)>{
32        let table_pos = self.table.get(self.fwd_hash)?;
33        let src_pos = self.table_to_abs_pos(table_pos);
34        if let Some((pre_match,post_match)) = extend_src_match(src, src_pos, trgt, self.fwd_pos) {
35            let total_post_match = post_match + 9;
36            return Some((src_pos,pre_match,total_post_match));
37        }
38        None
39    }
40    /// Positions returned from the table are in table space, this converts them to absolute start positions.
41    /// In other words, the table_pos is multiplied by l_step to get the absolute position.
42    pub(crate) fn table_to_abs_pos(&self, table_pos:usize)->usize{
43        table_pos * self.l_step
44    }
45    fn abs_to_table_pos(&self, abs_pos:usize)->usize{
46        debug_assert!(abs_pos % self.l_step == 0, "abs_pos({}) is !divisible by l_step({})",abs_pos,self.l_step);
47        abs_pos / self.l_step
48    }
49    fn store(&mut self, hash:usize, abs_pos:usize){
50        debug_assert!(abs_pos % self.l_step == 0);
51        let table_pos = self.abs_to_table_pos(abs_pos);
52        match self.table.insert(hash, table_pos){
53            Ok(None) => {},
54            Ok(Some(_prev)) => {
55                //self.chain.insert(hash, table_pos, prev);
56            },
57            Err((_old_hash,_prev_pos)) => {
58                //self.chain.insert(old_hash, table_pos, prev_pos);
59            }
60        }
61    }
62}
63
64// I used to add them in forward order, and on the test files actually got better matches
65// However xd3 puts them in in reverse order, and logically it makes more sense.
66// Since we don't store hash collisions/duplicates we probably want to keep hashes closest to our current position.
67// Thus storing the early (closer) positions will ensure that they are not evicted by later positions.
68pub(crate) fn add_start_positions_to_matcher(matcher: &mut SrcMatcher, cur_o_pos: usize, src: &[u8]) {
69    if cur_o_pos < matcher.next_hash_pos{
70        return;
71    }
72    if let Some(range) = calculate_window_range(cur_o_pos, matcher.src_len, matcher.trgt_len, matcher.half_win_size, matcher.max_end_pos, matcher.cur_window_end, matcher.l_step,matcher.max_match_pos) {
73        //Put the new positions in, in reverse order
74        //Reverse, because later positions are less likely to be used.
75        //The hash table only keeps the last hash for a given hash
76        //So hash/pos nearest our current position we want to ensure we keep.
77        //By going in reverse, any collisions/duplicate starts will be evicting matches later in the src file.
78        //The idea is that similar files will have similar offsets.
79        //Very different files will always suffer from poor alignment and missing matches.
80        //That is why it is best to use TrgtMatcher as well as secondary compression and not rely on the SrcMatcher alone.
81        debug_assert!(range.start % matcher.l_step == 0, "range.start({}) must be divisible by l_step({})",range.start,matcher.l_step);
82        if range.end >= matcher.max_end_pos{
83            matcher.next_hash_pos = usize::MAX;
84        }else{
85            //?? Should this be changed?
86            //would a 1/4 of the way through our window size make more sense?
87            // 3/4 of the way through our window size?
88            // Would have to do with the file size differences and aligning between the two files based on matches.
89            matcher.next_hash_pos = cur_o_pos + (matcher.half_win_size);
90        }
91
92        if matcher.l_step >= 9 {
93            for pos in range.step_by(matcher.l_step).rev() {
94                let hash = calculate_large_checksum(&src[pos..pos + 9]);
95                matcher.store(hash, pos)
96            }
97        }else{
98            let aligned_last_hash = align(range.end.saturating_sub(9),matcher.l_step);
99            let mut hash = calculate_large_checksum(&src[aligned_last_hash..range.end]);
100            for pos in (range.start..aligned_last_hash).rev() {
101                hash = update_large_checksum_bwd(hash, src[pos+9], src[pos]);
102                if pos % matcher.l_step == 0 {
103                    matcher.store(hash, pos);
104                }
105            }
106            // for pos in (0..aligned_last_hash).rev().step_by(matcher.l_step).skip(1) {
107            //     for inner_pos in (0..matcher.l_step).rev() {
108            //         let current_pos = pos + inner_pos;
109            //         hash = update_large_checksum_bwd(hash, src[current_pos + 9], src[current_pos]);
110            //     }
111            //     matcher.store(hash, pos);
112            // }
113
114        }
115    }
116}
117const DEFAULT_SRC_WIN_SIZE: usize = 1 << 26;
118const MIN_SRC_WIN_SIZE: usize = 1 << 20;
119const _HASH_CHUNK_SIZE: usize = 1 << 23;
120//const DEFAULT_PREV_SIZE: usize = 1 << 18;
121
122///Configuration for the SrcMatcher.
123#[derive(Debug, Clone)]
124pub struct SrcMatcherConfig{
125    /// How much to advance the Large Hash between storing a src hash.
126    /// Larger value means faster, but might miss good matches.
127    pub l_step: usize,
128    /// The maximum size of the source window.
129    /// This is how many bytes to assess and store hashes for.
130    /// Larger values consider more matches, but might hash excessively slowing down encoder.
131    /// Leave blank for dynamic calculation.
132    pub max_src_win_size: Option<usize>,
133}
134
135impl Default for SrcMatcherConfig {
136    fn default() -> Self {
137        Self::comp_level(3)
138    }
139}
140impl SrcMatcherConfig {
141    ///Creates a new SrcMatcherConfig with the given parameters.
142    /// l_step: How much to advance the Large Hash between storing a src hash.
143    /// max_src_win_size: The maximum size of the source window.
144    pub fn new(l_step: usize,max_src_win_size:Option<usize>) -> Self {
145        Self { l_step, max_src_win_size}
146    }
147    ///Creates a new SrcMatcherConfig with the given compression level.
148    /// level: The compression level to use. Must be between 0 and 9.
149    /// The higher the level the more accurate the matches but slower.
150    pub fn comp_level(level:usize)->Self{
151        assert!(level <= 9);
152        let l_step = Ranger::new(0..10, 26..=2).map(level);
153        Self { l_step, max_src_win_size: None}
154    }
155    fn make_inner_config(&mut self, src_len: usize,trgt_len:usize)->InnerConfig{
156        self.l_step = self.l_step.max(1);
157
158        self.max_src_win_size = Some(self
159            .max_src_win_size
160            .map(|s| s.next_power_of_two().max(MIN_SRC_WIN_SIZE))
161            .unwrap_or(
162                DEFAULT_SRC_WIN_SIZE
163                //calculate_default_win_size(src_len, trgt_len,None)
164            ));
165        InnerConfig{
166            l_step:self.l_step,
167            src_len,
168            trgt_len,
169            src_win_size:self.max_src_win_size.unwrap(),
170            max_end_pos:align(src_len.saturating_sub(9),self.l_step),
171        }
172    }
173    pub(crate) fn build(&mut self,src:&[u8],trgt_start_pos:usize,trgt:&[u8])->SrcMatcher{
174        let trgt_len = trgt.len();
175        let InnerConfig { l_step, src_len, trgt_len, src_win_size, max_end_pos } = self.make_inner_config(src.len(),trgt_len);
176        let max_fwd_hash_pos = trgt.len().saturating_sub(9);
177        let (fwd_hash,fwd_pos) = if trgt_start_pos < max_fwd_hash_pos {
178            (calculate_large_checksum(&trgt[trgt_start_pos..trgt_start_pos+9]),trgt_start_pos)
179        }else{
180            (0,max_fwd_hash_pos)
181        };
182        //regardless of win_size given, we do not need to store more than the entire src file.
183        let table_win_effective = (src_len.next_power_of_two() >> 1).min(src_win_size);
184        let table = BasicHashTable::new(table_win_effective/l_step, false);
185        let mut matcher = SrcMatcher{
186            table, src_len, trgt_len, max_end_pos,max_fwd_hash_pos,
187            fwd_hash,
188            fwd_pos,
189            l_step,
190            half_win_size: src_win_size>>1,
191            cur_window_end: 0,
192            next_hash_pos: 0,
193            max_match_pos: trgt_start_pos,
194        };
195        //prefill with hash start positions.
196        add_start_positions_to_matcher(&mut matcher, trgt_start_pos, src);
197        matcher
198    }
199}
200
201#[inline]
202fn align(pos:usize,l_step:usize)->usize{
203    pos - (pos % l_step)
204}
205
206///Returns the (pre_match, post_match) for the given src and trgt data.
207///None if the hash was a collision
208pub(crate) fn extend_src_match(src:&[u8],src_start:usize,trgt:&[u8],trgt_start:usize)->Option<(usize,usize)>{
209    //first verify hash matches the data
210    let initial_match = src[src_start..src_start + 9]
211        .iter().zip(trgt[trgt_start..trgt_start + 9].iter())
212        .all(|(a,b)| a == b);
213    if !initial_match{
214        return None;
215    }
216    // Extend backward
217    let min_offset = src_start.min(trgt_start);
218    let pre_match = if min_offset > 1 {
219        (1..min_offset).take_while(|&i| {
220            src[src_start - i] == trgt[trgt_start - i]
221        }).count()
222    }else{0};
223
224    // Extend forward
225    let src_end = src_start + 9;
226    let trgt_end = trgt_start + 9;
227    let src_remain = src.len() - src_end;
228    let trgt_remain = trgt.len() - trgt_end;
229    let post_match = (0..src_remain.min(trgt_remain)).take_while(|&i| {
230        src[src_end + i] == trgt[trgt_end + i]
231    }).count();
232    Some((pre_match,post_match))
233}
234
235//Thought I could cut down encoding time (which it does), but this misses lots of good matches on dissimilar files of similar length
236#[allow(dead_code)]
237fn calculate_default_win_size(src_len: usize, trgt_len: usize,max_win_size:Option<usize>) -> usize {
238    let mut win_size = (src_len).abs_diff(trgt_len).next_power_of_two();
239    if win_size <= MIN_SRC_WIN_SIZE {
240        win_size = win_size + MIN_SRC_WIN_SIZE;
241    }
242    let upper_bound = src_len.next_power_of_two().min(max_win_size.map(|a|a.next_power_of_two()).unwrap_or(DEFAULT_SRC_WIN_SIZE));
243    win_size.min(upper_bound)
244}
245
246#[inline]
247fn calculate_window_range(
248    cur_o_pos: usize,
249    src_len: usize,
250    trgt_len: usize,
251    half_win_size: usize,
252    max_end_pos: usize,
253    cur_window_end: usize,
254    l_step: usize,
255    max_match_end:usize,
256) -> Option<std::ops::Range<usize>> {
257    //min and max cases
258    if cur_o_pos < half_win_size {
259        return Some(0..(half_win_size<<1).min(max_end_pos));
260    }else if cur_window_end >= max_end_pos{
261        return None;
262    }
263    //else we are going to slide the window based on the current output position
264
265    //First find the scaled mid point, or the 'equivalent position in the src file'
266    //We use this as our 'input position' to calculate the window position.
267    //Note on `.min(max_match_end)`:
268    // If our longest match has exceeded even this scaled position, use that instead.
269    // We do not store already matched start positions.
270    // That is, we our greedy at moving the src matcher fwd.
271    let scaled_src_pos = cur_o_pos * src_len / trgt_len;
272
273    //We add our half window to the equivalent position.
274    let scaled_end = align((scaled_src_pos + half_win_size).min(max_end_pos),l_step);
275    if scaled_end <= cur_window_end || scaled_end <= max_match_end {//nothing more to hash yet
276        //this will be encountered often when the trgt file is (significantly) larger than the src file.
277        return None;
278    }
279    //The max amt we logically could hash
280    //We need this in case we move an entire window size forward.
281    //We re-center our window based on the scaled src position.
282    let max_diff_start = scaled_end.saturating_sub(half_win_size<<1);
283    let scaled_start = align(cur_window_end.max(max_match_end).max(max_diff_start),l_step);
284    Some(scaled_start..scaled_end)
285}
286
287#[cfg(test)]
288mod test_super {
289    use super::*;
290
291    #[test]
292    fn test_calculate_window_range_lrg_trgt() {
293        let half_win_size = 256;
294        let l_step = 2;
295        let src_len = 1000;
296        let trgt_len = 2000;
297        let max_end_pos = 994;
298
299        // Initial fast forward
300        let result = calculate_window_range(10, src_len, trgt_len, half_win_size, max_end_pos, 0, l_step, 0);
301        assert_eq!(result.unwrap(), 0..512, "Initial fast forward: Incorrect range");
302
303        let result = calculate_window_range(10, src_len, trgt_len, 512, max_end_pos, 0, l_step, 0);
304        assert!(result.is_some(), "Initial fast forward: Expected Some(0..1024), but got {:?}", result);
305        assert_eq!(result.unwrap(), 0..max_end_pos, "Initial fast forward: Incorrect range");
306
307        // End of file
308        let result = calculate_window_range(900, src_len, trgt_len, half_win_size, max_end_pos, 1000, l_step, 0);
309        assert!(result.is_none(), "End of file: Expected None, but got {:?}", result);
310
311        // Nothing more to hash
312        let result = calculate_window_range(500, src_len, trgt_len, half_win_size, max_end_pos, 700, l_step, 0);
313        assert!(result.is_none(), "Nothing to hash: Expected None, but got {:?}", result);
314
315        // Normal advancement
316        let result = calculate_window_range(1250, src_len, trgt_len, half_win_size, max_end_pos, 512, l_step, 0);
317        assert_eq!(result.unwrap(), 512..880, "Normal advancement: Incorrect range");
318
319        // Scaled end too far
320        let result = calculate_window_range(400, src_len, trgt_len, half_win_size, max_end_pos, 300, l_step, 1000);
321        assert!(result.is_none(), "Scaled end too far: Expected None, but got {:?}", result);
322
323        // Match is ahead of position
324        let result = calculate_window_range(400, src_len, trgt_len, half_win_size, max_end_pos, 300, l_step, 700);
325        assert!(result.is_none(), "Max Match Exceeds Position: Expected None, but got {:?}", result);
326    }
327
328    #[test]
329    fn test_calculate_default_win_size_various_conditions() {
330        // Testing when source length is much larger than target length
331        let src_len = 2_000_000;
332        let trgt_len = 100;
333        assert_eq!(calculate_default_win_size(src_len, trgt_len,None), src_len.next_power_of_two(), "test_large_source_small_target");
334
335        // Testing when target length is much larger than source length
336        let src_len = 100;
337        let trgt_len = 2_000_000;
338        assert_eq!(calculate_default_win_size(src_len, trgt_len,None), src_len.next_power_of_two(), "test_small_source_large_target");
339
340        // Testing when source and target lengths are the same
341        let src_len = 100_000_000;
342        let trgt_len = 90_000_000;
343        assert_eq!(calculate_default_win_size(src_len, trgt_len,None), 10_000_000usize.next_power_of_two(), "test_different_lengths");
344
345        // Testing when the difference is below MIN_SRC_WIN_SIZE
346        let src_len = 100_000_000;
347        let trgt_len = (src_len - MIN_SRC_WIN_SIZE) + 10;
348        assert_eq!(calculate_default_win_size(src_len, trgt_len,None), (src_len - trgt_len).next_power_of_two() + MIN_SRC_WIN_SIZE, "test_diff_below_min_src_win_size");
349
350        // Very large difference
351        let src_len = 100_000_000;
352        let trgt_len = 25_000_000;
353        assert_eq!(calculate_default_win_size(src_len, trgt_len,None), DEFAULT_SRC_WIN_SIZE, "test_source_below_min_src_win_size");
354    }
355
356}