1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
use crate::{hasher::*, hashmap::{BasicHashTable, ChainList}, Ranger};
pub(crate) struct TrgtMatcher{
compress_early_exit:usize,
chain_check: usize,
pub(crate) fwd_hash: u32,
pub(crate) fwd_pos: usize,
pub(crate) max_fwd_hash_pos: usize,
table: BasicHashTable,
chain: ChainList,
}
impl TrgtMatcher {
// pub(crate) fn hash_win_len(&self)->usize{
// self.hash_win_len
// }
// pub(crate) fn peek_next_pos(&self) -> usize{
// self.hasher.peek_next_pos()
// }
// pub(crate) fn peek_next_hash(&self) -> usize{
// self.hasher.peek_next_hash()
// }
// pub(crate) fn advance_and_store(&mut self) {
// //let start_pos = hasher.peek_next_pos();
// //dbg!(start_pos);
// self.hasher.next();
// self.store(self.hasher.peek_next_hash(),self.hasher.peek_next_pos());
// }
// pub(crate) fn seek(&mut self, pos:usize){
// debug_assert!(self.hasher.peek_next_pos() <= pos, "self.hasher.peek_next_pos({}) > pos({})",self.hasher.peek_next_pos(),pos);
// if let Some(_hash) = self.hasher.seek(pos) {
// //don't store the hash here. We seek before we query,
// // so we want to store using the advance_and_store method.
// // self.store(hash,pos)
// }
// }
///Returns (trgt_pos, post_match) post_match *includes* the hash_win_len.
pub fn find_best_trgt_match(&self,trgt:&[u8],min_match:usize)->Option<(usize,usize)>{
let cur_hash = self.fwd_hash as usize;
let table_pos = self.table.get(cur_hash)?;
let mut iter = std::iter::once(table_pos).chain(self.chain.iter_prev_starts(table_pos, self.fwd_pos,cur_hash)).filter(|start|start + 4 < self.fwd_pos);
let mut chain = if min_match > 4 {(self.chain_check/4).max(1)} else {self.chain_check};
let mut best = None;
let mut best_len = 0;
let mut _chain_len = 0;
let mut _collisions = 0;
loop {
if chain == 0{
break;
}
if let Some(start_pos) = iter.next() {
_chain_len += 1;
//first verify hash matches the data
let initial_match = trgt[start_pos..start_pos + 4]
.iter().zip(trgt[self.fwd_pos..self.fwd_pos + 4].iter())
.all(|(a,b)| a == b);
if !initial_match{
// dbg!(&trgt[start_pos..start_pos + hash_len], &trgt[cur_o_pos..cur_o_pos + hash_len],cur_o_pos,start_pos);
// panic!();
_collisions += 1;
continue;
}
// Extend forward
let match_end = start_pos + 4;
let trgt_end = self.fwd_pos + 4;
let src_remain = self.fwd_pos - match_end;
let trgt_remain = trgt.len() - trgt_end;
let post_match = (0..src_remain.min(trgt_remain)).take_while(|&i| {
trgt[match_end + i] == trgt[trgt_end + i]
}).count();
let total_post_match = post_match + 4;
if total_post_match > best_len{
best_len = total_post_match;
best = Some((start_pos,total_post_match));
if best_len >= self.compress_early_exit{
break;
}
}
chain -= 1;
}else{break;}
}
// if _collisions > 0{
// dbg!(_chain_len,_collisions,self.fwd_pos);
// if self.fwd_pos > 100{
// panic!();
// }
// }
// println!("MaxCheck: {}, Chain checked: {}, Fwd pos: {}, Collisions:{}, Best:{:?}",if min_match > 4 {(self.chain_check/4).max(1)} else {self.chain_check},_chain_len,self.fwd_pos,_collisions,best);
// std::thread::sleep(std::time::Duration::from_millis(100));
best
}
pub(crate) fn store(&mut self, hash:usize, pos:usize){
match self.table.insert(hash, pos){
Ok(None) => {},
Ok(Some(prev)) => {
self.chain.insert(hash, pos, prev);
},
Err((old_hash,prev_pos)) => {
self.chain.insert(old_hash, pos, prev_pos);
}
}
}
}
const DEFAULT_TRGT_WIN_SIZE: usize = 1 << 23;
const DEFAULT_PREV_SIZE: usize = 1 << 18;
///Configuration for the TrgtMatcher.
#[derive(Debug, Clone)]
pub struct TrgtMatcherConfig{
/// If the small match (in trgt) is >= than this we stop searching the chain for better matches and use this one.
pub compress_early_exit:usize,
/// Max number of entries to check in the chain during matching.
/// Larger value means more accurate matches but slower.
/// `compress_early_exit` stops checking the chain,
/// this value is the fallback in case the chain is long, and has no good matches.
pub chain_check: usize,
/// How many historical hashes to store if we find multiple start points for a given hash.
/// This memory is shared across all hashes. Leave blank for dynamic calculation.
pub prev_table_capacity: Option<usize>,
}
impl Default for TrgtMatcherConfig {
fn default() -> Self {
Self::comp_level(3)
}
}
impl TrgtMatcherConfig {
///Creates a new TrgtMatcherConfig with the given compression level.
/// level: The compression level to use. Must be between 0 and 9.
/// The higher the level the more accurate the matches but slower.
pub fn comp_level(level:usize)->Self{
assert!(level <= 9);
let compress_early_exit = Ranger::new(0..10, 6..=70).map(level);
let chain_check = Ranger::new(0..10, 1..=33).map(level);
Self { compress_early_exit, chain_check, prev_table_capacity: None}
}
pub fn with_table_capacity(mut self, table_capacity:usize)->Self{
self.prev_table_capacity = Some(table_capacity);
self
}
pub(crate) fn build(&mut self,trgt:&[u8],trgt_start_pos:usize)->TrgtMatcher{
let Self { compress_early_exit, chain_check, prev_table_capacity } = self;
let effective_len = trgt.len() - trgt_start_pos;
// self.prev_table_capacity = Some(self.prev_table_capacity
// .unwrap_or_else(||{
// let exact = max_unique_substrings_gt_hash_len(*win_size, effective_len, 1);
// exact.next_power_of_two() >> 1
// }));
prev_table_capacity.get_or_insert(DEFAULT_PREV_SIZE.min(effective_len.next_power_of_two()>>1));
//let table = BasicHashTable::new(DEFAULT_TRGT_WIN_SIZE.min((effective_len + (effective_len/2)).next_power_of_two() >> 1), self.prev_table_capacity.unwrap(),if *win_size>4{8}else{4});
let table = BasicHashTable::new(DEFAULT_TRGT_WIN_SIZE.min((effective_len + (effective_len/2)).next_power_of_two() >> 1), true);
let mut matcher = TrgtMatcher{
compress_early_exit: *compress_early_exit,
chain_check: *chain_check,
fwd_hash: 0,
fwd_pos: trgt_start_pos,
table,
chain: ChainList::new(self.prev_table_capacity.unwrap()),
max_fwd_hash_pos: trgt.len()-4,
};
if trgt_start_pos > 0 { //prefill with hash start positions.
let start = trgt_start_pos.saturating_sub(self.prev_table_capacity.unwrap());
let end = trgt_start_pos;
let mut hash = calculate_small_checksum(&trgt[start..]);
matcher.store(hash as usize, start);
for old_pos in start..end{
hash = update_small_checksum_fwd(hash, trgt[old_pos], trgt[old_pos + 4]);
matcher.store(hash as usize, old_pos + 1);
}
}
matcher
}
}