Skip to main content

delta/algorithm/
correcting.rs

1use std::collections::VecDeque;
2
3use crate::hash::{fingerprint, next_prime, RollingHash};
4use crate::splay::SplayTree;
5use crate::types::{Command, DiffOptions};
6
7/// One entry in the correction lookback buffer (Section 5.2).
8///
9/// The correcting algorithm may discover that a newly found match overlaps
10/// commands already emitted.  The buffer holds the most recent buf_cap tentative
11/// commands so they can be trimmed or cancelled (tail correction) when a better
12/// match is found.  Commands are flushed to the output list as they age out.
13struct BufEntry {
14    /// First V byte covered by this entry.
15    v_start: usize,
16    /// One past the last V byte covered.
17    v_end: usize,
18    /// The tentative command (Add or Copy).
19    cmd: Command,
20    /// Reserved; always false in the current implementation.
21    dummy: bool,
22}
23
24/// Flat hash-table slot for correcting — sentinel-based (no Option overhead).
25/// Empty slots have fp == u64::MAX.
26#[derive(Clone, Copy)]
27struct CSlot {
28    fp: u64,
29    offset: usize,
30}
31
32const EMPTY_CSLOT: CSlot = CSlot {
33    fp: u64::MAX,
34    offset: 0,
35};
36
37/// Correcting 1.5-Pass algorithm (Section 7, Figure 8) with
38/// fingerprint-based checkpointing (Section 8).
39///
40/// The hash table is auto-sized to max(q, 2 * num_seeds / p) so that
41/// checkpoint spacing m ≈ p, giving near-seed-granularity sampling.
42/// TABLE_SIZE acts as a floor for small files.
43///
44/// |C| = q (hash table capacity, auto-sized from input).
45/// |F| = next_prime(2 * num_R_seeds) (footprint universe, Section 8.1,
46///       pp. 347-348: "|F| ≈ 2L").
47/// m  = ceil(|F| / |C|) (checkpoint spacing, p. 348).
48/// k  = checkpoint class (Eq. 3, p. 348).
49///
50/// A seed with fingerprint fp passes the checkpoint test iff
51///     (fp % |F|) % m == k.
52/// Its table index is (fp % |F|) / m  (p. 348: "i = floor(f/m)").
53///
54/// Step 1 (R pass): compute fingerprint at every R position, apply
55/// checkpoint filter, store first-found offset per slot.
56/// Steps 3-4 (V scan): compute fingerprint at every V position, apply
57/// checkpoint filter, look up matching R offset.
58/// Step 5: extend match both forwards and backwards (Section 7, p. 345).
59/// Step 6: encode with tail correction via lookback buffer (Section 5.1).
60/// Backward extension (Section 8.2, p. 349) recovers true match starts
61/// that fall between checkpoint positions.
62pub fn diff_correcting(
63    r: &[u8],
64    v: &[u8],
65    opts: &DiffOptions,
66) -> Vec<Command> {
67    let p = opts.p;
68    let q = opts.q;
69    let buf_cap = opts.buf_cap;
70    let verbose = opts.verbose;
71    let use_splay = opts.use_splay;
72
73    let mut commands = Vec::new();
74    if v.is_empty() {
75        return commands;
76    }
77
78    // ── Checkpointing parameters (Section 8.1, pp. 347-348) ─────────
79    let num_seeds = if r.len() >= p { r.len() - p + 1 } else { 0 };
80    let max_table = opts.max_table;
81    // Auto-size: 2x factor for correcting's |F|=2L convention.
82    // Capped at max_table to prevent runaway allocation on huge inputs.
83    let cap = if num_seeds > 0 {
84        next_prime(max_table.min(q.max(2 * num_seeds / p)))
85    } else {
86        next_prime(q.min(max_table))
87    }; // |C|
88    let f_size: u64 = if num_seeds > 0 {
89        next_prime(2 * num_seeds) as u64 // |F|
90    } else {
91        1
92    };
93    let m: u64 = if f_size <= cap as u64 {
94        1
95    } else {
96        (f_size + cap as u64 - 1) / cap as u64 // ceil(|F| / |C|)
97    };
98    // Biased k (p. 348).
99    let k: u64 = if v.len() >= p {
100        fingerprint(v, (v.len() / 2).min(v.len() - p), p) % f_size % m
101    } else {
102        0
103    };
104
105    if verbose {
106        let expected = if m > 0 { num_seeds as u64 / m } else { 0 };
107        let occ_est = if cap > 0 { expected * 100 / cap as u64 } else { 0 };
108        eprintln!(
109            "correcting: {}, |C|={} |F|={} m={} k={}\n  \
110             checkpoint gap={} bytes, expected fill ~{} (~{}% table occupancy)\n  \
111             table memory ~{} MB",
112            if use_splay { "splay tree" } else { "hash table" },
113            cap, f_size, m, k,
114            m, expected, occ_est,
115            cap * std::mem::size_of::<CSlot>() / 1_048_576
116        );
117    }
118
119    // Debug counters
120    let mut dbg_build_passed: usize = 0;
121    let mut dbg_build_stored: usize = 0;
122    let mut dbg_build_probes: usize = 0; // extra slots scanned past the first
123    let mut dbg_scan_checkpoints: usize = 0;
124    let mut dbg_scan_match: usize = 0;
125    let mut dbg_scan_fp_mismatch: usize = 0;
126    let mut dbg_scan_byte_mismatch: usize = 0;
127
128    // Step (1): Build lookup structure for R (first-found policy, linear probing)
129    // Flat slot array — fp == u64::MAX marks empty slots.
130    let mut h_r_ht: Vec<CSlot> = if !use_splay { vec![EMPTY_CSLOT; cap] } else { Vec::new() };
131    let mut h_r_sp: SplayTree<(u64, usize)> = SplayTree::new(); // (full_fp, offset)
132
133    let build_start = std::time::Instant::now();
134    let mut rh_r = if num_seeds > 0 { Some(RollingHash::new(r, 0, p)) } else { None };
135    for a in 0..num_seeds {
136        let fp = if a == 0 {
137            rh_r.as_ref().unwrap().value()
138        } else {
139            let rh = rh_r.as_mut().unwrap();
140            rh.roll(r[a - 1], r[a + p - 1]);
141            rh.value()
142        };
143        let f = fp % f_size;
144        if f % m != k {
145            continue; // not a checkpoint seed
146        }
147        dbg_build_passed += 1;
148
149        if use_splay {
150            // insert_or_get implements first-found policy
151            let val = h_r_sp.insert_or_get(fp, (fp, a));
152            if val.1 == a {
153                dbg_build_stored += 1;
154            } else {
155                dbg_build_probes += 1;
156            }
157        } else {
158            // Linear probing: find empty slot or existing fp (first-found policy).
159            // Uses branch-based wraparound (i += 1; if i == cap { i = 0 }) rather
160            // than modulo to avoid division in the hot path.
161            let mut i = (f / m) as usize;
162            let i0 = i;
163            let mut store = true;
164            while h_r_ht[i].fp != u64::MAX {
165                if h_r_ht[i].fp == fp { store = false; break; } // same fp already stored — skip
166                i += 1; if i == cap { i = 0; }
167                dbg_build_probes += 1;
168                if i == i0 { store = false; break; }             // table full (safety)
169            }
170            if store {
171                h_r_ht[i] = CSlot { fp, offset: a }; // first-found (Section 7 Step 1)
172                dbg_build_stored += 1;
173            }
174        }
175    }
176
177    if verbose {
178        let passed_pct = if num_seeds > 0 {
179            dbg_build_passed as f64 / num_seeds as f64 * 100.0
180        } else {
181            0.0
182        };
183        let stored_count = if use_splay { h_r_sp.len() } else { dbg_build_stored };
184        let occ_pct = if cap > 0 {
185            stored_count as f64 / cap as f64 * 100.0
186        } else {
187            0.0
188        };
189        eprintln!(
190            "  build: {} seeds, {} passed checkpoint ({:.2}%), \
191             {} stored, {} extra probes\n  \
192             build: table occupancy {}/{} ({:.1}%), elapsed {:.2}s",
193            num_seeds, dbg_build_passed, passed_pct,
194            dbg_build_stored, dbg_build_probes,
195            stored_count, cap, occ_pct,
196            build_start.elapsed().as_secs_f64()
197        );
198    }
199
200    // Lookup helper — linear probing mirrors the build chain.
201    let lookup_r = |h_r_ht: &[CSlot], h_r_sp: &mut SplayTree<(u64, usize)>, fp_v: u64, f_v: u64| -> Option<(u64, usize)> {
202        if use_splay {
203            h_r_sp.find(fp_v).copied()
204        } else {
205            let mut i = (f_v / m) as usize;
206            let i0 = i;
207            while h_r_ht[i].fp != u64::MAX {
208                if h_r_ht[i].fp == fp_v { return Some((fp_v, h_r_ht[i].offset)); }
209                i += 1; if i == cap { i = 0; }
210                if i == i0 { return None; } // full table — not found
211            }
212            None
213        }
214    };
215
216    // ── Encoding lookback buffer (Section 5.2) ───────────────────────
217    let mut buf: VecDeque<BufEntry> = VecDeque::new();
218
219    let flush_buf = |buf: &mut VecDeque<BufEntry>, commands: &mut Vec<Command>| {
220        for entry in buf.drain(..) {
221            if !entry.dummy {
222                commands.push(entry.cmd);
223            }
224        }
225    };
226
227    // Step (2): initialize scan pointers
228    let mut v_c: usize = 0;
229    let mut v_s: usize = 0;
230
231    // Rolling hash for O(1) per-position V fingerprinting.
232    let v_seeds = if v.len() >= p { v.len() - p + 1 } else { 0 };
233    let mut rh_v = if v_seeds > 0 { Some(RollingHash::new(v, 0, p)) } else { None };
234    let mut rh_v_pos: usize = 0;
235
236    while v_c + p <= v.len() {
237        // Step (3): condition in while header.
238
239        // Step (4): generate footprint at v_c, apply checkpoint test.
240        let fp_v = if let Some(ref mut rh) = rh_v {
241            if v_c == rh_v_pos {
242                rh.value()
243            } else if v_c == rh_v_pos + 1 {
244                rh.roll(v[v_c - 1], v[v_c + p - 1]);
245                rh_v_pos = v_c;
246                rh.value()
247            } else {
248                *rh = RollingHash::new(v, v_c, p);
249                rh_v_pos = v_c;
250                rh.value()
251            }
252        } else {
253            break;
254        };
255        let f_v = fp_v % f_size;
256        if f_v % m != k {
257            v_c += 1;
258            continue; // not a checkpoint — skip
259        }
260
261        // Checkpoint passed — look up R.
262        dbg_scan_checkpoints += 1;
263
264        let entry = lookup_r(&h_r_ht, &mut h_r_sp, fp_v, f_v);
265
266        let r_offset = match entry {
267            Some((stored_fp, offset)) if stored_fp == fp_v => {
268                // Full fingerprint matches — verify bytes.
269                if r[offset..offset + p] != v[v_c..v_c + p] {
270                    dbg_scan_byte_mismatch += 1;
271                    v_c += 1;
272                    continue;
273                }
274                dbg_scan_match += 1;
275                offset
276            }
277            Some(_) => {
278                dbg_scan_fp_mismatch += 1;
279                v_c += 1;
280                continue;
281            }
282            None => {
283                v_c += 1;
284                continue;
285            }
286        };
287
288        // Step (5): extend match forwards and backwards
289        // (Section 7, Step 5; Section 8.2 backward extension, p. 349)
290        // Pre-compute max extension, compare slices (one bounds check).
291        let max_fwd = (v.len() - v_c).min(r.len() - r_offset);
292        let fwd = p + v[v_c + p..v_c + max_fwd]
293            .iter()
294            .zip(&r[r_offset + p..r_offset + max_fwd])
295            .position(|(a, b)| a != b)
296            .unwrap_or(max_fwd - p);
297
298        let max_bwd = v_c.min(r_offset);
299        let bwd = if max_bwd == 0 {
300            0
301        } else {
302            v[v_c - max_bwd..v_c]
303                .iter()
304                .rev()
305                .zip(r[r_offset - max_bwd..r_offset].iter().rev())
306                .position(|(a, b)| a != b)
307                .unwrap_or(max_bwd)
308        };
309
310        let v_m = v_c - bwd;
311        let r_m = r_offset - bwd;
312        let ml = bwd + fwd;
313        let match_end = v_m + ml;
314
315        // Step (6): encode with correction
316        if v_s <= v_m {
317            // (6a) match is entirely in unencoded suffix (Section 7)
318            if v_s < v_m {
319                if buf.len() >= buf_cap {
320                    let oldest = buf.pop_front().unwrap();
321                    if !oldest.dummy {
322                        commands.push(oldest.cmd);
323                    }
324                }
325                buf.push_back(BufEntry {
326                    v_start: v_s,
327                    v_end: v_m,
328                    cmd: Command::Add {
329                        data: v[v_s..v_m].to_vec(),
330                    },
331                    dummy: false,
332                });
333            }
334            if buf.len() >= buf_cap {
335                let oldest = buf.pop_front().unwrap();
336                if !oldest.dummy {
337                    commands.push(oldest.cmd);
338                }
339            }
340            buf.push_back(BufEntry {
341                v_start: v_m,
342                v_end: match_end,
343                cmd: Command::Copy {
344                    offset: r_m,
345                    length: ml,
346                },
347                dummy: false,
348            });
349            v_s = match_end;
350        } else {
351            // (6b) match extends backward into encoded prefix —
352            // tail correction (Section 5.1, p. 339)
353            let mut effective_start = v_s;
354
355            while let Some(tail) = buf.back() {
356                if tail.dummy {
357                    buf.pop_back();
358                    continue;
359                }
360
361                if tail.v_start >= v_m && tail.v_end <= match_end {
362                    // Wholly within new match — absorb
363                    effective_start = effective_start.min(tail.v_start);
364                    buf.pop_back();
365                    continue;
366                }
367
368                if tail.v_end > v_m && tail.v_start < v_m {
369                    if matches!(tail.cmd, Command::Add { .. }) {
370                        // Partial add — trim to [v_start, v_m)
371                        let keep = v_m - tail.v_start;
372                        if keep > 0 {
373                            let back = buf.back_mut().unwrap();
374                            back.cmd = Command::Add {
375                                data: v[back.v_start..v_m].to_vec(),
376                            };
377                            back.v_end = v_m;
378                        } else {
379                            buf.pop_back();
380                        }
381                        effective_start = effective_start.min(v_m);
382                    }
383                    // Partial copy — don't reclaim (Section 5.1)
384                    break;
385                }
386
387                // No overlap with match
388                break;
389            }
390
391            let adj = effective_start - v_m;
392            let new_len = match_end - effective_start;
393            if new_len > 0 {
394                if buf.len() >= buf_cap {
395                    let oldest = buf.pop_front().unwrap();
396                    if !oldest.dummy {
397                        commands.push(oldest.cmd);
398                    }
399                }
400                buf.push_back(BufEntry {
401                    v_start: effective_start,
402                    v_end: match_end,
403                    cmd: Command::Copy {
404                        offset: r_m + adj,
405                        length: new_len,
406                    },
407                    dummy: false,
408                });
409            }
410            v_s = match_end;
411        }
412
413        // Step (7): advance past matched region
414        v_c = match_end;
415    }
416
417    // Step (8): flush buffer and trailing add
418    flush_buf(&mut buf, &mut commands);
419    if v_s < v.len() {
420        commands.push(Command::Add {
421            data: v[v_s..].to_vec(),
422        });
423    }
424
425    if verbose {
426        let v_seeds = if v.len() >= p { v.len() - p + 1 } else { 0 };
427        let cp_pct = if v_seeds > 0 {
428            dbg_scan_checkpoints as f64 / v_seeds as f64 * 100.0
429        } else {
430            0.0
431        };
432        let hit_pct = if dbg_scan_checkpoints > 0 {
433            dbg_scan_match as f64 / dbg_scan_checkpoints as f64 * 100.0
434        } else {
435            0.0
436        };
437        eprintln!(
438            "  scan: {} V positions, {} checkpoints ({:.3}%), {} matches\n  \
439             scan: hit rate {:.1}% (of checkpoints), \
440             fp collisions {}, byte mismatches {}",
441            v_seeds, dbg_scan_checkpoints, cp_pct, dbg_scan_match,
442            hit_pct, dbg_scan_fp_mismatch, dbg_scan_byte_mismatch
443        );
444        super::print_command_stats(&commands);
445    }
446
447    commands
448}
449
450/// Convenience wrapper with default parameters.
451pub fn diff_correcting_default(r: &[u8], v: &[u8]) -> Vec<Command> {
452    diff_correcting(r, v, &DiffOptions::default())
453}