smdiff_encoder/
trgt_matcher.rs

1use crate::{hasher::*, hashmap::{BasicHashTable, ChainList}, Ranger};
2
3
4
5pub(crate) struct TrgtMatcher{
6    compress_early_exit:usize,
7    chain_check: usize,
8    pub(crate) fwd_hash: u32,
9    pub(crate) fwd_pos: usize,
10    pub(crate) max_fwd_hash_pos: usize,
11    table: BasicHashTable,
12    chain: ChainList,
13}
14
15impl TrgtMatcher {
16    // pub(crate) fn hash_win_len(&self)->usize{
17    //     self.hash_win_len
18    // }
19    // pub(crate) fn peek_next_pos(&self) -> usize{
20    //     self.hasher.peek_next_pos()
21    // }
22    // pub(crate) fn peek_next_hash(&self) -> usize{
23    //     self.hasher.peek_next_hash()
24    // }
25    // pub(crate) fn advance_and_store(&mut self) {
26    //     //let start_pos = hasher.peek_next_pos();
27    //     //dbg!(start_pos);
28    //     self.hasher.next();
29    //     self.store(self.hasher.peek_next_hash(),self.hasher.peek_next_pos());
30    // }
31    // pub(crate) fn seek(&mut self, pos:usize){
32    //     debug_assert!(self.hasher.peek_next_pos() <= pos, "self.hasher.peek_next_pos({}) > pos({})",self.hasher.peek_next_pos(),pos);
33    //     if let Some(_hash) = self.hasher.seek(pos) {
34    //         //don't store the hash here. We seek before we query,
35    //         // so we want to store using the advance_and_store method.
36    //         // self.store(hash,pos)
37    //     }
38    // }
39    ///Returns (trgt_pos, post_match) post_match *includes* the hash_win_len.
40    pub fn find_best_trgt_match(&self,trgt:&[u8],min_match:usize)->Option<(usize,usize)>{
41        let cur_hash = self.fwd_hash as usize;
42        let table_pos = self.table.get(cur_hash)?;
43        let mut iter = std::iter::once(table_pos).chain(self.chain.iter_prev_starts(table_pos, self.fwd_pos,cur_hash)).filter(|start|start + 4 < self.fwd_pos);
44        let mut chain = if min_match > 4 {(self.chain_check/4).max(1)} else {self.chain_check};
45        let mut best = None;
46        let mut best_len = 0;
47        let mut _chain_len = 0;
48        let mut _collisions = 0;
49        loop {
50            if chain == 0{
51                break;
52            }
53            if let Some(start_pos) = iter.next() {
54                _chain_len += 1;
55                //first verify hash matches the data
56                let initial_match = trgt[start_pos..start_pos + 4]
57                .iter().zip(trgt[self.fwd_pos..self.fwd_pos + 4].iter())
58                .all(|(a,b)| a == b);
59                if !initial_match{
60                    // dbg!(&trgt[start_pos..start_pos + hash_len], &trgt[cur_o_pos..cur_o_pos + hash_len],cur_o_pos,start_pos);
61                    // panic!();
62                    _collisions += 1;
63                    continue;
64                }
65
66                // Extend forward
67                let match_end = start_pos + 4;
68                let trgt_end = self.fwd_pos + 4;
69                let src_remain = self.fwd_pos - match_end;
70                let trgt_remain = trgt.len() - trgt_end;
71                let post_match = (0..src_remain.min(trgt_remain)).take_while(|&i| {
72                    trgt[match_end + i] == trgt[trgt_end + i]
73                }).count();
74                let total_post_match = post_match + 4;
75                if total_post_match > best_len{
76                    best_len = total_post_match;
77                    best = Some((start_pos,total_post_match));
78                    if best_len >= self.compress_early_exit{
79                        break;
80                    }
81                }
82                chain -= 1;
83            }else{break;}
84
85        }
86        // if _collisions > 0{
87        //     dbg!(_chain_len,_collisions,self.fwd_pos);
88        //     if self.fwd_pos > 100{
89        //         panic!();
90        //     }
91        // }
92        // println!("MaxCheck: {}, Chain checked: {}, Fwd pos: {}, Collisions:{}, Best:{:?}",if min_match > 4 {(self.chain_check/4).max(1)} else {self.chain_check},_chain_len,self.fwd_pos,_collisions,best);
93        // std::thread::sleep(std::time::Duration::from_millis(100));
94        best
95    }
96    pub(crate) fn store(&mut self, hash:usize, pos:usize){
97        match self.table.insert(hash, pos){
98            Ok(None) => {},
99            Ok(Some(prev)) => {
100                self.chain.insert(hash, pos, prev);
101            },
102            Err((old_hash,prev_pos)) => {
103                self.chain.insert(old_hash, pos, prev_pos);
104            }
105        }
106    }
107}
108
109
110
111
112
113const DEFAULT_TRGT_WIN_SIZE: usize = 1 << 23;
114const DEFAULT_PREV_SIZE: usize = 1 << 18;
115///Configuration for the TrgtMatcher.
116#[derive(Debug, Clone)]
117pub struct TrgtMatcherConfig{
118    /// If the small match (in trgt) is >= than this we stop searching the chain for better matches and use this one.
119    pub compress_early_exit:usize,
120    /// Max number of entries to check in the chain during matching.
121    /// Larger value means more accurate matches but slower.
122    /// `compress_early_exit` stops checking the chain,
123    /// this value is the fallback in case the chain is long, and has no good matches.
124    pub chain_check: usize,
125    /// How many historical hashes to store if we find multiple start points for a given hash.
126    /// This memory is shared across all hashes. Leave blank for dynamic calculation.
127    pub prev_table_capacity: Option<usize>,
128}
129
130impl Default for TrgtMatcherConfig {
131    fn default() -> Self {
132        Self::comp_level(3)
133    }
134}
135impl TrgtMatcherConfig {
136    ///Creates a new TrgtMatcherConfig with the given compression level.
137    /// level: The compression level to use. Must be between 0 and 9.
138    /// The higher the level the more accurate the matches but slower.
139    pub fn comp_level(level:usize)->Self{
140        assert!(level <= 9);
141        let compress_early_exit = Ranger::new(0..10, 6..=70).map(level);
142        let chain_check = Ranger::new(0..10, 1..=33).map(level);
143        Self { compress_early_exit, chain_check, prev_table_capacity: None}
144    }
145    pub fn with_table_capacity(mut self, table_capacity:usize)->Self{
146        self.prev_table_capacity = Some(table_capacity);
147        self
148    }
149    pub(crate) fn build(&mut self,trgt:&[u8],trgt_start_pos:usize)->TrgtMatcher{
150        let Self { compress_early_exit, chain_check, prev_table_capacity } = self;
151        let effective_len = trgt.len() - trgt_start_pos;
152        // self.prev_table_capacity =  Some(self.prev_table_capacity
153        //     .unwrap_or_else(||{
154        //         let exact = max_unique_substrings_gt_hash_len(*win_size, effective_len, 1);
155        //         exact.next_power_of_two() >> 1
156        //     }));
157        prev_table_capacity.get_or_insert(DEFAULT_PREV_SIZE.min(effective_len.next_power_of_two()>>1));
158        //let table = BasicHashTable::new(DEFAULT_TRGT_WIN_SIZE.min((effective_len + (effective_len/2)).next_power_of_two() >> 1), self.prev_table_capacity.unwrap(),if *win_size>4{8}else{4});
159        let table = BasicHashTable::new(DEFAULT_TRGT_WIN_SIZE.min((effective_len + (effective_len/2)).next_power_of_two() >> 1), true);
160        let mut matcher = TrgtMatcher{
161            compress_early_exit: *compress_early_exit,
162            chain_check: *chain_check,
163            fwd_hash: 0,
164            fwd_pos: trgt_start_pos,
165            table,
166            chain: ChainList::new(self.prev_table_capacity.unwrap()),
167            max_fwd_hash_pos: trgt.len().saturating_sub(4),
168        };
169        if trgt_start_pos > 0 { //prefill with hash start positions.
170            let start = trgt_start_pos.saturating_sub(self.prev_table_capacity.unwrap());
171            let end = trgt_start_pos;
172            let mut hash = calculate_small_checksum(&trgt[start..]);
173            matcher.store(hash as usize, start);
174            for old_pos in start..end{
175                hash = update_small_checksum_fwd(hash, trgt[old_pos], trgt[old_pos + 4]);
176                matcher.store(hash as usize, old_pos + 1);
177            }
178        }
179        matcher
180    }
181}