use crate::{hasher::*, src_matcher::{add_start_positions_to_matcher, SrcMatcherConfig}, trgt_matcher::TrgtMatcherConfig};
#[allow(unused)]
#[derive(Copy,Clone,Debug)]
pub enum LargerTrgtNaiveTests{
Prepend,
Append,
PrependAppend,
AppendPrepend,
}
impl Default for LargerTrgtNaiveTests {
fn default() -> Self {
LargerTrgtNaiveTests::AppendPrepend
}
}
#[derive(Debug,Clone)]
pub(crate) enum InnerOp{
MatchSrc{start:usize,length:usize,o_pos:usize},
MatchTrgt{start:usize,length:usize,o_pos:usize},
Run{byte:u8,length:usize,o_pos:usize},
}
impl InnerOp {
pub(crate) fn o_pos(&self)->&usize{
match self {
InnerOp::MatchSrc{o_pos,..} => o_pos,
InnerOp::MatchTrgt{o_pos,..} => o_pos,
InnerOp::Run{o_pos,..} => o_pos,
}
}
pub(crate) fn len(&self)->&usize{
match self {
InnerOp::MatchSrc{length, ..} => length,
InnerOp::MatchTrgt{length,..} => length,
InnerOp::Run{length,..} => length,
}
}
pub(crate) fn set_len(&mut self,new_len:usize){
assert!(new_len <= *self.len(), "Cannot increase length of InnerOp");
match self {
InnerOp::MatchSrc{length,..} => *length = new_len,
InnerOp::MatchTrgt{length,..} => *length = new_len,
InnerOp::Run{length,..} => *length = new_len,
}
}
pub(crate) fn set_o_pos(&mut self,new_pos:usize){
let cur_pos = *self.o_pos();
let cur_end = cur_pos + *self.len();
assert!(new_pos >= cur_pos, "Cannot decrease position of InnerOp");
let diff = new_pos - cur_pos;
match self {
InnerOp::MatchSrc { start, length, o_pos } => {
*start += diff;
*o_pos = new_pos;
*length -= diff;
},
InnerOp::MatchTrgt { start, length, o_pos } => {
*start += diff;
*o_pos = new_pos;
*length -= diff;
},
InnerOp::Run { length, o_pos, .. } => {
*o_pos = new_pos;
*length -= diff;
},
}
assert_eq!(self.o_pos() + self.len(), cur_end, "End position of InnerOp is not maintained");
}
}
pub(crate) fn encode_inner(config:&mut GenericEncoderConfig,src:&[u8],trgt:&[u8])->Vec<InnerOp>{
let naive_tests = config.naive_tests;
let trgt_len = trgt.len();
if (config.match_src.is_none() && config.match_trgt.is_none())
|| (src.len() == 0 && config.match_trgt.is_none())
|| trgt_len == 0 {
return vec![];
}
let src_len = src.len();
let mut ops = Vec::new();
let mut max_trgt_match_len = trgt_len;
let mut cur_o_pos = 0;
if trgt_len > src_len && naive_tests.is_some(){ let naive_tests = naive_tests.unwrap();
let handle_start_match = |segs: &mut Vec<InnerOp>, cur_o: &mut usize| {
if trgt.starts_with(src) {
segs.push(InnerOp::MatchSrc {start: 0,length: src.len(),o_pos: 0});
*cur_o = src.len();
true
} else {false}
};
let handle_end_match = |max_t_len: &mut usize| {
if trgt.ends_with(src) {*max_t_len = trgt_len - src_len;true} else {false}
};
match naive_tests {
LargerTrgtNaiveTests::Append => {handle_start_match(&mut ops, &mut cur_o_pos);},
LargerTrgtNaiveTests::Prepend => {handle_end_match(&mut max_trgt_match_len);},
LargerTrgtNaiveTests::AppendPrepend => {
if !handle_start_match(&mut ops, &mut cur_o_pos){handle_end_match(&mut max_trgt_match_len);}
}
LargerTrgtNaiveTests::PrependAppend => {
if !handle_end_match(&mut max_trgt_match_len){handle_start_match(&mut ops, &mut cur_o_pos);}
}
}
}
let _start = std::time::Instant::now();
let mut trgt_matcher = config.match_trgt.as_mut().map(|c| c.build(trgt, cur_o_pos));
let mut src_matcher = config.match_src.as_mut().map(|c| c.build(src, cur_o_pos, trgt));
let lazy_escape_len = config.lazy_escape_len.unwrap_or(90);
let min_match_value = 4;
let mut min_match= min_match_value;
let mut run_len = 0;
let mut run_byte = 0;
let mut state = EncoderState::StartNewMatch;
let _start = std::time::Instant::now();
loop {
if cur_o_pos + min_match_value >= max_trgt_match_len {
break;
}
match state{
EncoderState::StartNewMatch => {
run_len = find_initial_run_len(&trgt[cur_o_pos..], min_match_value, &mut run_byte);
if let Some(matcher) = src_matcher.as_mut(){
if matcher.next_hash_pos <= cur_o_pos{
add_start_positions_to_matcher(matcher, cur_o_pos, src)
}
if matcher.fwd_pos < matcher.max_fwd_hash_pos{
if matcher.fwd_pos + 9 > cur_o_pos{
for old_pos in matcher.fwd_pos..cur_o_pos{
matcher.fwd_hash = update_large_checksum_fwd(matcher.fwd_hash, trgt[old_pos], trgt[old_pos+9]);
}
}else{
matcher.fwd_hash = calculate_large_checksum(&trgt[cur_o_pos..cur_o_pos+9]);
}
matcher.fwd_pos = cur_o_pos;
}
};
if let Some(matcher) = trgt_matcher.as_mut(){
if matcher.fwd_pos < matcher.max_fwd_hash_pos{
if matcher.fwd_pos + 4 > cur_o_pos{
for old_pos in matcher.fwd_pos..cur_o_pos{
matcher.fwd_hash = update_small_checksum_fwd(matcher.fwd_hash, trgt[old_pos], trgt[old_pos+4]);
}
}else{
matcher.fwd_hash = calculate_small_checksum(&trgt[cur_o_pos..cur_o_pos+4]);
}
matcher.fwd_pos = cur_o_pos;
}
}
let last_match_end_pos = ops.last().map(|x|x.o_pos()+x.len()).unwrap_or(0);
min_match = if last_match_end_pos > cur_o_pos {
min_match_value.max(1 + last_match_end_pos - cur_o_pos)
} else {min_match_value};
state = EncoderState::TryMatch;
},
EncoderState::TryMatch => {
let remaining_trgt = &trgt[cur_o_pos..];
if run_len == min_match_value{
remaining_trgt[min_match_value..].iter().take_while(|&x| x == &run_byte).for_each(|_|run_len += 1);
if run_len >= min_match{
let run_start = cur_o_pos; debug_assert!(trgt[run_start..].iter().take_while(|x|*x == &run_byte).count() == run_len,"{:?}",&trgt[run_start-min_match_value..run_start+run_len]);
ops.push(InnerOp::Run{byte:run_byte as u8,length:run_len,o_pos:run_start});
state = EncoderState::FoundMatch { match_len: run_len };
continue;
}
}
if let Some(matcher) = src_matcher.as_mut(){
if cur_o_pos + 9 <= max_trgt_match_len{
debug_assert!(matcher.fwd_pos == cur_o_pos, "lh.pos {} != cur o {}",matcher.fwd_pos,cur_o_pos);
if let Some((src_start,pre_match,post_match)) = matcher.find_best_src_match(src, trgt) {
let length = pre_match + post_match;
if post_match >= min_match{
let trgt_match_start = cur_o_pos - pre_match;
if pre_match > 0{
clear_existing_ops(&mut ops, trgt_match_start)
}
let src_match_start = src_start - pre_match;
debug_assert!(src_match_start + length <= src_len);
debug_assert!(src[src_match_start..src_match_start+length] == trgt[trgt_match_start..trgt_match_start+length]);
ops.push(InnerOp::MatchSrc{start:src_match_start, length, o_pos:trgt_match_start});
state = EncoderState::FoundMatch { match_len: post_match };
continue;
}
}
}
}
if let Some(matcher) = trgt_matcher.as_mut(){
if cur_o_pos + 4 <= max_trgt_match_len{
debug_assert!(matcher.fwd_pos == cur_o_pos, "sh.pos {} != cur o {}",matcher.fwd_pos,cur_o_pos);
if let Some((match_start,length)) = matcher.find_best_trgt_match(trgt,min_match) {
if length >= min_match{
debug_assert!(match_start + length <= cur_o_pos);
debug_assert!(trgt[match_start..match_start+length] == trgt[cur_o_pos..cur_o_pos+length]);
ops.push(InnerOp::MatchTrgt{start:match_start, length, o_pos:cur_o_pos});
state = EncoderState::FoundMatch { match_len: length };
continue;
}
}
}
}
if min_match > min_match_value{
min_match -= 1;
}
state = EncoderState::MoveForwardOneByte;
},
EncoderState::FoundMatch{ match_len } => {
if lazy_escape_len > 0
&& match_len < lazy_escape_len
&& cur_o_pos + match_len < max_trgt_match_len - 1
{
min_match = match_len;
state = EncoderState::MoveForwardOneByte;
}else{
cur_o_pos += match_len;
state = EncoderState::StartNewMatch;
}
},
EncoderState::MoveForwardOneByte => {
let next_char = trgt[cur_o_pos + min_match_value];
if run_byte == next_char{
run_len += 1;
}else{
run_len = 1;
run_byte = next_char;
}
if let Some(matcher) = trgt_matcher.as_mut() {
matcher.store(matcher.fwd_hash as usize, cur_o_pos);
if matcher.fwd_pos < matcher.max_fwd_hash_pos{
matcher.fwd_hash = update_small_checksum_fwd(matcher.fwd_hash, trgt[cur_o_pos], trgt[cur_o_pos+4]);
matcher.fwd_pos = cur_o_pos+1;
}
}
if let Some(m) = src_matcher.as_mut(){
if m.fwd_pos < m.max_fwd_hash_pos{
m.fwd_hash = update_large_checksum_fwd(m.fwd_hash, trgt[cur_o_pos], trgt[cur_o_pos+9]);
m.fwd_pos = cur_o_pos+1;
}
}
cur_o_pos += 1;
state = EncoderState::TryMatch;
},
}
}
assert!(cur_o_pos<=trgt_len);
if max_trgt_match_len < trgt_len{
ops.push(InnerOp::MatchSrc { start: 0, length: src_len, o_pos: max_trgt_match_len });
}
ops
}
enum EncoderState{
StartNewMatch,
TryMatch,
FoundMatch{match_len:usize},
MoveForwardOneByte,
}
#[inline(always)]
fn find_initial_run_len(seg: &[u8], match_len: usize, run_byte: &mut u8) -> usize {
let mut run_len = 0;
let mut last_byte = *run_byte;
for i in 0..match_len {
let next_char = seg[i];
if last_byte == next_char{
run_len += 1;
}else{
run_len = 1;
last_byte = next_char;
}
}
*run_byte = last_byte;
run_len
}
#[inline(always)]
fn clear_existing_ops(ops:&mut Vec<InnerOp>,gte_start:usize){
while ops.last().map(|x|*x.o_pos()).unwrap_or(0) >= gte_start{
let _evicted = ops.pop().unwrap();
}
}
#[derive(Debug, Clone)]
pub struct GenericEncoderConfig{
pub match_trgt: Option<TrgtMatcherConfig>,
pub match_src: Option<SrcMatcherConfig>,
pub lazy_escape_len: Option<usize>,
pub naive_tests: Option<LargerTrgtNaiveTests>,
}
#[allow(unused)]
impl GenericEncoderConfig {
pub fn new(match_trgt: Option<TrgtMatcherConfig>, match_src: Option<SrcMatcherConfig>, naive_tests:Option<LargerTrgtNaiveTests>,lazy_escape_len:Option<usize>) -> Self {
Self { match_trgt, match_src,naive_tests,lazy_escape_len }
}
}
impl Default for GenericEncoderConfig {
fn default() -> Self {
Self { match_trgt: None, match_src: Some(SrcMatcherConfig::comp_level(3)),naive_tests: Some(LargerTrgtNaiveTests::Append),lazy_escape_len: Some(45)}
}
}