smdiff_encoder/trgt_matcher.rs
1use crate::{hasher::*, hashmap::{BasicHashTable, ChainList}, Ranger};
2
3
4
5pub(crate) struct TrgtMatcher{
6 compress_early_exit:usize,
7 chain_check: usize,
8 pub(crate) fwd_hash: u32,
9 pub(crate) fwd_pos: usize,
10 pub(crate) max_fwd_hash_pos: usize,
11 table: BasicHashTable,
12 chain: ChainList,
13}
14
15impl TrgtMatcher {
16 // pub(crate) fn hash_win_len(&self)->usize{
17 // self.hash_win_len
18 // }
19 // pub(crate) fn peek_next_pos(&self) -> usize{
20 // self.hasher.peek_next_pos()
21 // }
22 // pub(crate) fn peek_next_hash(&self) -> usize{
23 // self.hasher.peek_next_hash()
24 // }
25 // pub(crate) fn advance_and_store(&mut self) {
26 // //let start_pos = hasher.peek_next_pos();
27 // //dbg!(start_pos);
28 // self.hasher.next();
29 // self.store(self.hasher.peek_next_hash(),self.hasher.peek_next_pos());
30 // }
31 // pub(crate) fn seek(&mut self, pos:usize){
32 // debug_assert!(self.hasher.peek_next_pos() <= pos, "self.hasher.peek_next_pos({}) > pos({})",self.hasher.peek_next_pos(),pos);
33 // if let Some(_hash) = self.hasher.seek(pos) {
34 // //don't store the hash here. We seek before we query,
35 // // so we want to store using the advance_and_store method.
36 // // self.store(hash,pos)
37 // }
38 // }
39 ///Returns (trgt_pos, post_match) post_match *includes* the hash_win_len.
40 pub fn find_best_trgt_match(&self,trgt:&[u8],min_match:usize)->Option<(usize,usize)>{
41 let cur_hash = self.fwd_hash as usize;
42 let table_pos = self.table.get(cur_hash)?;
43 let mut iter = std::iter::once(table_pos).chain(self.chain.iter_prev_starts(table_pos, self.fwd_pos,cur_hash)).filter(|start|start + 4 < self.fwd_pos);
44 let mut chain = if min_match > 4 {(self.chain_check/4).max(1)} else {self.chain_check};
45 let mut best = None;
46 let mut best_len = 0;
47 let mut _chain_len = 0;
48 let mut _collisions = 0;
49 loop {
50 if chain == 0{
51 break;
52 }
53 if let Some(start_pos) = iter.next() {
54 _chain_len += 1;
55 //first verify hash matches the data
56 let initial_match = trgt[start_pos..start_pos + 4]
57 .iter().zip(trgt[self.fwd_pos..self.fwd_pos + 4].iter())
58 .all(|(a,b)| a == b);
59 if !initial_match{
60 // dbg!(&trgt[start_pos..start_pos + hash_len], &trgt[cur_o_pos..cur_o_pos + hash_len],cur_o_pos,start_pos);
61 // panic!();
62 _collisions += 1;
63 continue;
64 }
65
66 // Extend forward
67 let match_end = start_pos + 4;
68 let trgt_end = self.fwd_pos + 4;
69 let src_remain = self.fwd_pos - match_end;
70 let trgt_remain = trgt.len() - trgt_end;
71 let post_match = (0..src_remain.min(trgt_remain)).take_while(|&i| {
72 trgt[match_end + i] == trgt[trgt_end + i]
73 }).count();
74 let total_post_match = post_match + 4;
75 if total_post_match > best_len{
76 best_len = total_post_match;
77 best = Some((start_pos,total_post_match));
78 if best_len >= self.compress_early_exit{
79 break;
80 }
81 }
82 chain -= 1;
83 }else{break;}
84
85 }
86 // if _collisions > 0{
87 // dbg!(_chain_len,_collisions,self.fwd_pos);
88 // if self.fwd_pos > 100{
89 // panic!();
90 // }
91 // }
92 // println!("MaxCheck: {}, Chain checked: {}, Fwd pos: {}, Collisions:{}, Best:{:?}",if min_match > 4 {(self.chain_check/4).max(1)} else {self.chain_check},_chain_len,self.fwd_pos,_collisions,best);
93 // std::thread::sleep(std::time::Duration::from_millis(100));
94 best
95 }
96 pub(crate) fn store(&mut self, hash:usize, pos:usize){
97 match self.table.insert(hash, pos){
98 Ok(None) => {},
99 Ok(Some(prev)) => {
100 self.chain.insert(hash, pos, prev);
101 },
102 Err((old_hash,prev_pos)) => {
103 self.chain.insert(old_hash, pos, prev_pos);
104 }
105 }
106 }
107}
108
109
110
111
112
113const DEFAULT_TRGT_WIN_SIZE: usize = 1 << 23;
114const DEFAULT_PREV_SIZE: usize = 1 << 18;
115///Configuration for the TrgtMatcher.
116#[derive(Debug, Clone)]
117pub struct TrgtMatcherConfig{
118 /// If the small match (in trgt) is >= than this we stop searching the chain for better matches and use this one.
119 pub compress_early_exit:usize,
120 /// Max number of entries to check in the chain during matching.
121 /// Larger value means more accurate matches but slower.
122 /// `compress_early_exit` stops checking the chain,
123 /// this value is the fallback in case the chain is long, and has no good matches.
124 pub chain_check: usize,
125 /// How many historical hashes to store if we find multiple start points for a given hash.
126 /// This memory is shared across all hashes. Leave blank for dynamic calculation.
127 pub prev_table_capacity: Option<usize>,
128}
129
130impl Default for TrgtMatcherConfig {
131 fn default() -> Self {
132 Self::comp_level(3)
133 }
134}
135impl TrgtMatcherConfig {
136 ///Creates a new TrgtMatcherConfig with the given compression level.
137 /// level: The compression level to use. Must be between 0 and 9.
138 /// The higher the level the more accurate the matches but slower.
139 pub fn comp_level(level:usize)->Self{
140 assert!(level <= 9);
141 let compress_early_exit = Ranger::new(0..10, 6..=70).map(level);
142 let chain_check = Ranger::new(0..10, 1..=33).map(level);
143 Self { compress_early_exit, chain_check, prev_table_capacity: None}
144 }
145 pub fn with_table_capacity(mut self, table_capacity:usize)->Self{
146 self.prev_table_capacity = Some(table_capacity);
147 self
148 }
149 pub(crate) fn build(&mut self,trgt:&[u8],trgt_start_pos:usize)->TrgtMatcher{
150 let Self { compress_early_exit, chain_check, prev_table_capacity } = self;
151 let effective_len = trgt.len() - trgt_start_pos;
152 // self.prev_table_capacity = Some(self.prev_table_capacity
153 // .unwrap_or_else(||{
154 // let exact = max_unique_substrings_gt_hash_len(*win_size, effective_len, 1);
155 // exact.next_power_of_two() >> 1
156 // }));
157 prev_table_capacity.get_or_insert(DEFAULT_PREV_SIZE.min(effective_len.next_power_of_two()>>1));
158 //let table = BasicHashTable::new(DEFAULT_TRGT_WIN_SIZE.min((effective_len + (effective_len/2)).next_power_of_two() >> 1), self.prev_table_capacity.unwrap(),if *win_size>4{8}else{4});
159 let table = BasicHashTable::new(DEFAULT_TRGT_WIN_SIZE.min((effective_len + (effective_len/2)).next_power_of_two() >> 1), true);
160 let mut matcher = TrgtMatcher{
161 compress_early_exit: *compress_early_exit,
162 chain_check: *chain_check,
163 fwd_hash: 0,
164 fwd_pos: trgt_start_pos,
165 table,
166 chain: ChainList::new(self.prev_table_capacity.unwrap()),
167 max_fwd_hash_pos: trgt.len().saturating_sub(4),
168 };
169 if trgt_start_pos > 0 { //prefill with hash start positions.
170 let start = trgt_start_pos.saturating_sub(self.prev_table_capacity.unwrap());
171 let end = trgt_start_pos;
172 let mut hash = calculate_small_checksum(&trgt[start..]);
173 matcher.store(hash as usize, start);
174 for old_pos in start..end{
175 hash = update_small_checksum_fwd(hash, trgt[old_pos], trgt[old_pos + 4]);
176 matcher.store(hash as usize, old_pos + 1);
177 }
178 }
179 matcher
180 }
181}