use std::collections::VecDeque;
use crate::hash::{fingerprint, next_prime, RollingHash};
use crate::splay::SplayTree;
use crate::types::{Command, DiffOptions};
struct BufEntry {
v_start: usize,
v_end: usize,
cmd: Command,
dummy: bool,
}
#[derive(Clone, Copy)]
struct CSlot {
fp: u64,
offset: usize,
}
const EMPTY_CSLOT: CSlot = CSlot {
fp: u64::MAX,
offset: 0,
};
pub fn diff_correcting(
r: &[u8],
v: &[u8],
opts: &DiffOptions,
) -> Vec<Command> {
let p = opts.p;
let q = opts.q;
let buf_cap = opts.buf_cap;
let verbose = opts.verbose;
let use_splay = opts.use_splay;
let mut commands = Vec::new();
if v.is_empty() {
return commands;
}
let num_seeds = if r.len() >= p { r.len() - p + 1 } else { 0 };
let max_table = opts.max_table;
let cap = if num_seeds > 0 {
next_prime(max_table.min(q.max(2 * num_seeds / p)))
} else {
next_prime(q.min(max_table))
}; let f_size: u64 = if num_seeds > 0 {
next_prime(2 * num_seeds) as u64 } else {
1
};
let m: u64 = if f_size <= cap as u64 {
1
} else {
(f_size + cap as u64 - 1) / cap as u64 };
let k: u64 = if v.len() >= p {
fingerprint(v, (v.len() / 2).min(v.len() - p), p) % f_size % m
} else {
0
};
if verbose {
let expected = if m > 0 { num_seeds as u64 / m } else { 0 };
let occ_est = if cap > 0 { expected * 100 / cap as u64 } else { 0 };
eprintln!(
"correcting: {}, |C|={} |F|={} m={} k={}\n \
checkpoint gap={} bytes, expected fill ~{} (~{}% table occupancy)\n \
table memory ~{} MB",
if use_splay { "splay tree" } else { "hash table" },
cap, f_size, m, k,
m, expected, occ_est,
cap * std::mem::size_of::<CSlot>() / 1_048_576
);
}
let mut dbg_build_passed: usize = 0;
let mut dbg_build_stored: usize = 0;
let mut dbg_build_probes: usize = 0; let mut dbg_scan_checkpoints: usize = 0;
let mut dbg_scan_match: usize = 0;
let mut dbg_scan_fp_mismatch: usize = 0;
let mut dbg_scan_byte_mismatch: usize = 0;
let mut h_r_ht: Vec<CSlot> = if !use_splay { vec![EMPTY_CSLOT; cap] } else { Vec::new() };
let mut h_r_sp: SplayTree<(u64, usize)> = SplayTree::new();
let build_start = std::time::Instant::now();
let mut rh_r = if num_seeds > 0 { Some(RollingHash::new(r, 0, p)) } else { None };
for a in 0..num_seeds {
let fp = if a == 0 {
rh_r.as_ref().unwrap().value()
} else {
let rh = rh_r.as_mut().unwrap();
rh.roll(r[a - 1], r[a + p - 1]);
rh.value()
};
let f = fp % f_size;
if f % m != k {
continue; }
dbg_build_passed += 1;
if use_splay {
let val = h_r_sp.insert_or_get(fp, (fp, a));
if val.1 == a {
dbg_build_stored += 1;
} else {
dbg_build_probes += 1;
}
} else {
let mut i = (f / m) as usize;
let i0 = i;
let mut store = true;
while h_r_ht[i].fp != u64::MAX {
if h_r_ht[i].fp == fp { store = false; break; } i += 1; if i == cap { i = 0; }
dbg_build_probes += 1;
if i == i0 { store = false; break; } }
if store {
h_r_ht[i] = CSlot { fp, offset: a }; dbg_build_stored += 1;
}
}
}
if verbose {
let passed_pct = if num_seeds > 0 {
dbg_build_passed as f64 / num_seeds as f64 * 100.0
} else {
0.0
};
let stored_count = if use_splay { h_r_sp.len() } else { dbg_build_stored };
let occ_pct = if cap > 0 {
stored_count as f64 / cap as f64 * 100.0
} else {
0.0
};
eprintln!(
" build: {} seeds, {} passed checkpoint ({:.2}%), \
{} stored, {} extra probes\n \
build: table occupancy {}/{} ({:.1}%), elapsed {:.2}s",
num_seeds, dbg_build_passed, passed_pct,
dbg_build_stored, dbg_build_probes,
stored_count, cap, occ_pct,
build_start.elapsed().as_secs_f64()
);
}
let lookup_r = |h_r_ht: &[CSlot], h_r_sp: &mut SplayTree<(u64, usize)>, fp_v: u64, f_v: u64| -> Option<(u64, usize)> {
if use_splay {
h_r_sp.find(fp_v).copied()
} else {
let mut i = (f_v / m) as usize;
let i0 = i;
while h_r_ht[i].fp != u64::MAX {
if h_r_ht[i].fp == fp_v { return Some((fp_v, h_r_ht[i].offset)); }
i += 1; if i == cap { i = 0; }
if i == i0 { return None; } }
None
}
};
let mut buf: VecDeque<BufEntry> = VecDeque::new();
let flush_buf = |buf: &mut VecDeque<BufEntry>, commands: &mut Vec<Command>| {
for entry in buf.drain(..) {
if !entry.dummy {
commands.push(entry.cmd);
}
}
};
let mut v_c: usize = 0;
let mut v_s: usize = 0;
let v_seeds = if v.len() >= p { v.len() - p + 1 } else { 0 };
let mut rh_v = if v_seeds > 0 { Some(RollingHash::new(v, 0, p)) } else { None };
let mut rh_v_pos: usize = 0;
while v_c + p <= v.len() {
let fp_v = if let Some(ref mut rh) = rh_v {
if v_c == rh_v_pos {
rh.value()
} else if v_c == rh_v_pos + 1 {
rh.roll(v[v_c - 1], v[v_c + p - 1]);
rh_v_pos = v_c;
rh.value()
} else {
*rh = RollingHash::new(v, v_c, p);
rh_v_pos = v_c;
rh.value()
}
} else {
break;
};
let f_v = fp_v % f_size;
if f_v % m != k {
v_c += 1;
continue; }
dbg_scan_checkpoints += 1;
let entry = lookup_r(&h_r_ht, &mut h_r_sp, fp_v, f_v);
let r_offset = match entry {
Some((stored_fp, offset)) if stored_fp == fp_v => {
if r[offset..offset + p] != v[v_c..v_c + p] {
dbg_scan_byte_mismatch += 1;
v_c += 1;
continue;
}
dbg_scan_match += 1;
offset
}
Some(_) => {
dbg_scan_fp_mismatch += 1;
v_c += 1;
continue;
}
None => {
v_c += 1;
continue;
}
};
let max_fwd = (v.len() - v_c).min(r.len() - r_offset);
let fwd = p + v[v_c + p..v_c + max_fwd]
.iter()
.zip(&r[r_offset + p..r_offset + max_fwd])
.position(|(a, b)| a != b)
.unwrap_or(max_fwd - p);
let max_bwd = v_c.min(r_offset);
let bwd = if max_bwd == 0 {
0
} else {
v[v_c - max_bwd..v_c]
.iter()
.rev()
.zip(r[r_offset - max_bwd..r_offset].iter().rev())
.position(|(a, b)| a != b)
.unwrap_or(max_bwd)
};
let v_m = v_c - bwd;
let r_m = r_offset - bwd;
let ml = bwd + fwd;
let match_end = v_m + ml;
if v_s <= v_m {
if v_s < v_m {
if buf.len() >= buf_cap {
let oldest = buf.pop_front().unwrap();
if !oldest.dummy {
commands.push(oldest.cmd);
}
}
buf.push_back(BufEntry {
v_start: v_s,
v_end: v_m,
cmd: Command::Add {
data: v[v_s..v_m].to_vec(),
},
dummy: false,
});
}
if buf.len() >= buf_cap {
let oldest = buf.pop_front().unwrap();
if !oldest.dummy {
commands.push(oldest.cmd);
}
}
buf.push_back(BufEntry {
v_start: v_m,
v_end: match_end,
cmd: Command::Copy {
offset: r_m,
length: ml,
},
dummy: false,
});
v_s = match_end;
} else {
let mut effective_start = v_s;
while let Some(tail) = buf.back() {
if tail.dummy {
buf.pop_back();
continue;
}
if tail.v_start >= v_m && tail.v_end <= match_end {
effective_start = effective_start.min(tail.v_start);
buf.pop_back();
continue;
}
if tail.v_end > v_m && tail.v_start < v_m {
if matches!(tail.cmd, Command::Add { .. }) {
let keep = v_m - tail.v_start;
if keep > 0 {
let back = buf.back_mut().unwrap();
back.cmd = Command::Add {
data: v[back.v_start..v_m].to_vec(),
};
back.v_end = v_m;
} else {
buf.pop_back();
}
effective_start = effective_start.min(v_m);
}
break;
}
break;
}
let adj = effective_start - v_m;
let new_len = match_end - effective_start;
if new_len > 0 {
if buf.len() >= buf_cap {
let oldest = buf.pop_front().unwrap();
if !oldest.dummy {
commands.push(oldest.cmd);
}
}
buf.push_back(BufEntry {
v_start: effective_start,
v_end: match_end,
cmd: Command::Copy {
offset: r_m + adj,
length: new_len,
},
dummy: false,
});
}
v_s = match_end;
}
v_c = match_end;
}
flush_buf(&mut buf, &mut commands);
if v_s < v.len() {
commands.push(Command::Add {
data: v[v_s..].to_vec(),
});
}
if verbose {
let v_seeds = if v.len() >= p { v.len() - p + 1 } else { 0 };
let cp_pct = if v_seeds > 0 {
dbg_scan_checkpoints as f64 / v_seeds as f64 * 100.0
} else {
0.0
};
let hit_pct = if dbg_scan_checkpoints > 0 {
dbg_scan_match as f64 / dbg_scan_checkpoints as f64 * 100.0
} else {
0.0
};
eprintln!(
" scan: {} V positions, {} checkpoints ({:.3}%), {} matches\n \
scan: hit rate {:.1}% (of checkpoints), \
fp collisions {}, byte mismatches {}",
v_seeds, dbg_scan_checkpoints, cp_pct, dbg_scan_match,
hit_pct, dbg_scan_fp_mismatch, dbg_scan_byte_mismatch
);
super::print_command_stats(&commands);
}
commands
}
pub fn diff_correcting_default(r: &[u8], v: &[u8]) -> Vec<Command> {
diff_correcting(r, v, &DiffOptions::default())
}