1use std::collections::VecDeque;
2
3use crate::hash::{fingerprint, next_prime, RollingHash};
4use crate::splay::SplayTree;
5use crate::types::{Command, DiffOptions};
6
7struct BufEntry {
14 v_start: usize,
16 v_end: usize,
18 cmd: Command,
20 dummy: bool,
22}
23
24#[derive(Clone, Copy)]
27struct CSlot {
28 fp: u64,
29 offset: usize,
30}
31
32const EMPTY_CSLOT: CSlot = CSlot {
33 fp: u64::MAX,
34 offset: 0,
35};
36
37pub fn diff_correcting(
63 r: &[u8],
64 v: &[u8],
65 opts: &DiffOptions,
66) -> Vec<Command> {
67 let p = opts.p;
68 let q = opts.q;
69 let buf_cap = opts.buf_cap;
70 let verbose = opts.verbose;
71 let use_splay = opts.use_splay;
72
73 let mut commands = Vec::new();
74 if v.is_empty() {
75 return commands;
76 }
77
78 let num_seeds = if r.len() >= p { r.len() - p + 1 } else { 0 };
80 let max_table = opts.max_table;
81 let cap = if num_seeds > 0 {
84 next_prime(max_table.min(q.max(2 * num_seeds / p)))
85 } else {
86 next_prime(q.min(max_table))
87 }; let f_size: u64 = if num_seeds > 0 {
89 next_prime(2 * num_seeds) as u64 } else {
91 1
92 };
93 let m: u64 = if f_size <= cap as u64 {
94 1
95 } else {
96 (f_size + cap as u64 - 1) / cap as u64 };
98 let k: u64 = if v.len() >= p {
100 fingerprint(v, (v.len() / 2).min(v.len() - p), p) % f_size % m
101 } else {
102 0
103 };
104
105 if verbose {
106 let expected = if m > 0 { num_seeds as u64 / m } else { 0 };
107 let occ_est = if cap > 0 { expected * 100 / cap as u64 } else { 0 };
108 eprintln!(
109 "correcting: {}, |C|={} |F|={} m={} k={}\n \
110 checkpoint gap={} bytes, expected fill ~{} (~{}% table occupancy)\n \
111 table memory ~{} MB",
112 if use_splay { "splay tree" } else { "hash table" },
113 cap, f_size, m, k,
114 m, expected, occ_est,
115 cap * std::mem::size_of::<CSlot>() / 1_048_576
116 );
117 }
118
119 let mut dbg_build_passed: usize = 0;
121 let mut dbg_build_stored: usize = 0;
122 let mut dbg_build_probes: usize = 0; let mut dbg_scan_checkpoints: usize = 0;
124 let mut dbg_scan_match: usize = 0;
125 let mut dbg_scan_fp_mismatch: usize = 0;
126 let mut dbg_scan_byte_mismatch: usize = 0;
127
128 let mut h_r_ht: Vec<CSlot> = if !use_splay { vec![EMPTY_CSLOT; cap] } else { Vec::new() };
131 let mut h_r_sp: SplayTree<(u64, usize)> = SplayTree::new(); let build_start = std::time::Instant::now();
134 let mut rh_r = if num_seeds > 0 { Some(RollingHash::new(r, 0, p)) } else { None };
135 for a in 0..num_seeds {
136 let fp = if a == 0 {
137 rh_r.as_ref().unwrap().value()
138 } else {
139 let rh = rh_r.as_mut().unwrap();
140 rh.roll(r[a - 1], r[a + p - 1]);
141 rh.value()
142 };
143 let f = fp % f_size;
144 if f % m != k {
145 continue; }
147 dbg_build_passed += 1;
148
149 if use_splay {
150 let val = h_r_sp.insert_or_get(fp, (fp, a));
152 if val.1 == a {
153 dbg_build_stored += 1;
154 } else {
155 dbg_build_probes += 1;
156 }
157 } else {
158 let mut i = (f / m) as usize;
162 let i0 = i;
163 let mut store = true;
164 while h_r_ht[i].fp != u64::MAX {
165 if h_r_ht[i].fp == fp { store = false; break; } i += 1; if i == cap { i = 0; }
167 dbg_build_probes += 1;
168 if i == i0 { store = false; break; } }
170 if store {
171 h_r_ht[i] = CSlot { fp, offset: a }; dbg_build_stored += 1;
173 }
174 }
175 }
176
177 if verbose {
178 let passed_pct = if num_seeds > 0 {
179 dbg_build_passed as f64 / num_seeds as f64 * 100.0
180 } else {
181 0.0
182 };
183 let stored_count = if use_splay { h_r_sp.len() } else { dbg_build_stored };
184 let occ_pct = if cap > 0 {
185 stored_count as f64 / cap as f64 * 100.0
186 } else {
187 0.0
188 };
189 eprintln!(
190 " build: {} seeds, {} passed checkpoint ({:.2}%), \
191 {} stored, {} extra probes\n \
192 build: table occupancy {}/{} ({:.1}%), elapsed {:.2}s",
193 num_seeds, dbg_build_passed, passed_pct,
194 dbg_build_stored, dbg_build_probes,
195 stored_count, cap, occ_pct,
196 build_start.elapsed().as_secs_f64()
197 );
198 }
199
200 let lookup_r = |h_r_ht: &[CSlot], h_r_sp: &mut SplayTree<(u64, usize)>, fp_v: u64, f_v: u64| -> Option<(u64, usize)> {
202 if use_splay {
203 h_r_sp.find(fp_v).copied()
204 } else {
205 let mut i = (f_v / m) as usize;
206 let i0 = i;
207 while h_r_ht[i].fp != u64::MAX {
208 if h_r_ht[i].fp == fp_v { return Some((fp_v, h_r_ht[i].offset)); }
209 i += 1; if i == cap { i = 0; }
210 if i == i0 { return None; } }
212 None
213 }
214 };
215
216 let mut buf: VecDeque<BufEntry> = VecDeque::new();
218
219 let flush_buf = |buf: &mut VecDeque<BufEntry>, commands: &mut Vec<Command>| {
220 for entry in buf.drain(..) {
221 if !entry.dummy {
222 commands.push(entry.cmd);
223 }
224 }
225 };
226
227 let mut v_c: usize = 0;
229 let mut v_s: usize = 0;
230
231 let v_seeds = if v.len() >= p { v.len() - p + 1 } else { 0 };
233 let mut rh_v = if v_seeds > 0 { Some(RollingHash::new(v, 0, p)) } else { None };
234 let mut rh_v_pos: usize = 0;
235
236 while v_c + p <= v.len() {
237 let fp_v = if let Some(ref mut rh) = rh_v {
241 if v_c == rh_v_pos {
242 rh.value()
243 } else if v_c == rh_v_pos + 1 {
244 rh.roll(v[v_c - 1], v[v_c + p - 1]);
245 rh_v_pos = v_c;
246 rh.value()
247 } else {
248 *rh = RollingHash::new(v, v_c, p);
249 rh_v_pos = v_c;
250 rh.value()
251 }
252 } else {
253 break;
254 };
255 let f_v = fp_v % f_size;
256 if f_v % m != k {
257 v_c += 1;
258 continue; }
260
261 dbg_scan_checkpoints += 1;
263
264 let entry = lookup_r(&h_r_ht, &mut h_r_sp, fp_v, f_v);
265
266 let r_offset = match entry {
267 Some((stored_fp, offset)) if stored_fp == fp_v => {
268 if r[offset..offset + p] != v[v_c..v_c + p] {
270 dbg_scan_byte_mismatch += 1;
271 v_c += 1;
272 continue;
273 }
274 dbg_scan_match += 1;
275 offset
276 }
277 Some(_) => {
278 dbg_scan_fp_mismatch += 1;
279 v_c += 1;
280 continue;
281 }
282 None => {
283 v_c += 1;
284 continue;
285 }
286 };
287
288 let max_fwd = (v.len() - v_c).min(r.len() - r_offset);
292 let fwd = p + v[v_c + p..v_c + max_fwd]
293 .iter()
294 .zip(&r[r_offset + p..r_offset + max_fwd])
295 .position(|(a, b)| a != b)
296 .unwrap_or(max_fwd - p);
297
298 let max_bwd = v_c.min(r_offset);
299 let bwd = if max_bwd == 0 {
300 0
301 } else {
302 v[v_c - max_bwd..v_c]
303 .iter()
304 .rev()
305 .zip(r[r_offset - max_bwd..r_offset].iter().rev())
306 .position(|(a, b)| a != b)
307 .unwrap_or(max_bwd)
308 };
309
310 let v_m = v_c - bwd;
311 let r_m = r_offset - bwd;
312 let ml = bwd + fwd;
313 let match_end = v_m + ml;
314
315 if v_s <= v_m {
317 if v_s < v_m {
319 if buf.len() >= buf_cap {
320 let oldest = buf.pop_front().unwrap();
321 if !oldest.dummy {
322 commands.push(oldest.cmd);
323 }
324 }
325 buf.push_back(BufEntry {
326 v_start: v_s,
327 v_end: v_m,
328 cmd: Command::Add {
329 data: v[v_s..v_m].to_vec(),
330 },
331 dummy: false,
332 });
333 }
334 if buf.len() >= buf_cap {
335 let oldest = buf.pop_front().unwrap();
336 if !oldest.dummy {
337 commands.push(oldest.cmd);
338 }
339 }
340 buf.push_back(BufEntry {
341 v_start: v_m,
342 v_end: match_end,
343 cmd: Command::Copy {
344 offset: r_m,
345 length: ml,
346 },
347 dummy: false,
348 });
349 v_s = match_end;
350 } else {
351 let mut effective_start = v_s;
354
355 while let Some(tail) = buf.back() {
356 if tail.dummy {
357 buf.pop_back();
358 continue;
359 }
360
361 if tail.v_start >= v_m && tail.v_end <= match_end {
362 effective_start = effective_start.min(tail.v_start);
364 buf.pop_back();
365 continue;
366 }
367
368 if tail.v_end > v_m && tail.v_start < v_m {
369 if matches!(tail.cmd, Command::Add { .. }) {
370 let keep = v_m - tail.v_start;
372 if keep > 0 {
373 let back = buf.back_mut().unwrap();
374 back.cmd = Command::Add {
375 data: v[back.v_start..v_m].to_vec(),
376 };
377 back.v_end = v_m;
378 } else {
379 buf.pop_back();
380 }
381 effective_start = effective_start.min(v_m);
382 }
383 break;
385 }
386
387 break;
389 }
390
391 let adj = effective_start - v_m;
392 let new_len = match_end - effective_start;
393 if new_len > 0 {
394 if buf.len() >= buf_cap {
395 let oldest = buf.pop_front().unwrap();
396 if !oldest.dummy {
397 commands.push(oldest.cmd);
398 }
399 }
400 buf.push_back(BufEntry {
401 v_start: effective_start,
402 v_end: match_end,
403 cmd: Command::Copy {
404 offset: r_m + adj,
405 length: new_len,
406 },
407 dummy: false,
408 });
409 }
410 v_s = match_end;
411 }
412
413 v_c = match_end;
415 }
416
417 flush_buf(&mut buf, &mut commands);
419 if v_s < v.len() {
420 commands.push(Command::Add {
421 data: v[v_s..].to_vec(),
422 });
423 }
424
425 if verbose {
426 let v_seeds = if v.len() >= p { v.len() - p + 1 } else { 0 };
427 let cp_pct = if v_seeds > 0 {
428 dbg_scan_checkpoints as f64 / v_seeds as f64 * 100.0
429 } else {
430 0.0
431 };
432 let hit_pct = if dbg_scan_checkpoints > 0 {
433 dbg_scan_match as f64 / dbg_scan_checkpoints as f64 * 100.0
434 } else {
435 0.0
436 };
437 eprintln!(
438 " scan: {} V positions, {} checkpoints ({:.3}%), {} matches\n \
439 scan: hit rate {:.1}% (of checkpoints), \
440 fp collisions {}, byte mismatches {}",
441 v_seeds, dbg_scan_checkpoints, cp_pct, dbg_scan_match,
442 hit_pct, dbg_scan_fp_mismatch, dbg_scan_byte_mismatch
443 );
444 super::print_command_stats(&commands);
445 }
446
447 commands
448}
449
450pub fn diff_correcting_default(r: &[u8], v: &[u8]) -> Vec<Command> {
452 diff_correcting(r, v, &DiffOptions::default())
453}