delta-compression 0.5.0

Binary delta compression — diff and apply for byte sequences, with rolling-hash matching, splay-tree indexing, and in-place patching.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
use std::collections::VecDeque;

use crate::hash::{fingerprint, next_prime, RollingHash};
use crate::splay::SplayTree;
use crate::types::{Command, DiffOptions};

/// One entry in the correction lookback buffer (Section 5.2).
///
/// The correcting algorithm may discover that a newly found match overlaps
/// commands already emitted.  The buffer holds the most recent buf_cap tentative
/// commands so they can be trimmed or cancelled (tail correction) when a better
/// match is found.  Commands are flushed to the output list as they age out.
struct BufEntry {
    /// First V byte covered by this entry.
    v_start: usize,
    /// One past the last V byte covered.
    v_end: usize,
    /// The tentative command (Add or Copy).
    cmd: Command,
    /// Reserved; always false in the current implementation.
    dummy: bool,
}

/// Flat hash-table slot for correcting — sentinel-based (no Option overhead).
/// Empty slots have fp == u64::MAX.
#[derive(Clone, Copy)]
struct CSlot {
    fp: u64,
    offset: usize,
}

const EMPTY_CSLOT: CSlot = CSlot {
    fp: u64::MAX,
    offset: 0,
};

/// Correcting 1.5-Pass algorithm (Section 7, Figure 8) with
/// fingerprint-based checkpointing (Section 8).
///
/// The hash table is auto-sized to max(q, 2 * num_seeds / p) so that
/// checkpoint spacing m ≈ p, giving near-seed-granularity sampling.
/// TABLE_SIZE acts as a floor for small files.
///
/// |C| = q (hash table capacity, auto-sized from input).
/// |F| = next_prime(2 * num_R_seeds) (footprint universe, Section 8.1,
///       pp. 347-348: "|F| ≈ 2L").
/// m  = ceil(|F| / |C|) (checkpoint spacing, p. 348).
/// k  = checkpoint class (Eq. 3, p. 348).
///
/// A seed with fingerprint fp passes the checkpoint test iff
///     (fp % |F|) % m == k.
/// Its table index is (fp % |F|) / m  (p. 348: "i = floor(f/m)").
///
/// Step 1 (R pass): compute fingerprint at every R position, apply
/// checkpoint filter, store first-found offset per slot.
/// Steps 3-4 (V scan): compute fingerprint at every V position, apply
/// checkpoint filter, look up matching R offset.
/// Step 5: extend match both forwards and backwards (Section 7, p. 345).
/// Step 6: encode with tail correction via lookback buffer (Section 5.1).
/// Backward extension (Section 8.2, p. 349) recovers true match starts
/// that fall between checkpoint positions.
pub fn diff_correcting(
    r: &[u8],
    v: &[u8],
    opts: &DiffOptions,
) -> Vec<Command> {
    let p = opts.p;
    let q = opts.q;
    let buf_cap = opts.buf_cap;
    let verbose = opts.verbose;
    let use_splay = opts.use_splay;

    let mut commands = Vec::new();
    if v.is_empty() {
        return commands;
    }

    // ── Checkpointing parameters (Section 8.1, pp. 347-348) ─────────
    let num_seeds = if r.len() >= p { r.len() - p + 1 } else { 0 };
    let max_table = opts.max_table;
    // Auto-size: 2x factor for correcting's |F|=2L convention.
    // Capped at max_table to prevent runaway allocation on huge inputs.
    let cap = if num_seeds > 0 {
        next_prime(max_table.min(q.max(2 * num_seeds / p)))
    } else {
        next_prime(q.min(max_table))
    }; // |C|
    let f_size: u64 = if num_seeds > 0 {
        next_prime(2 * num_seeds) as u64 // |F|
    } else {
        1
    };
    let m: u64 = if f_size <= cap as u64 {
        1
    } else {
        (f_size + cap as u64 - 1) / cap as u64 // ceil(|F| / |C|)
    };
    // Biased k (p. 348).
    let k: u64 = if v.len() >= p {
        fingerprint(v, (v.len() / 2).min(v.len() - p), p) % f_size % m
    } else {
        0
    };

    if verbose {
        let expected = if m > 0 { num_seeds as u64 / m } else { 0 };
        let occ_est = if cap > 0 { expected * 100 / cap as u64 } else { 0 };
        eprintln!(
            "correcting: {}, |C|={} |F|={} m={} k={}\n  \
             checkpoint gap={} bytes, expected fill ~{} (~{}% table occupancy)\n  \
             table memory ~{} MB",
            if use_splay { "splay tree" } else { "hash table" },
            cap, f_size, m, k,
            m, expected, occ_est,
            cap * std::mem::size_of::<CSlot>() / 1_048_576
        );
    }

    // Debug counters
    let mut dbg_build_passed: usize = 0;
    let mut dbg_build_stored: usize = 0;
    let mut dbg_build_probes: usize = 0; // extra slots scanned past the first
    let mut dbg_scan_checkpoints: usize = 0;
    let mut dbg_scan_match: usize = 0;
    let mut dbg_scan_fp_mismatch: usize = 0;
    let mut dbg_scan_byte_mismatch: usize = 0;

    // Step (1): Build lookup structure for R (first-found policy, linear probing)
    // Flat slot array — fp == u64::MAX marks empty slots.
    let mut h_r_ht: Vec<CSlot> = if !use_splay { vec![EMPTY_CSLOT; cap] } else { Vec::new() };
    let mut h_r_sp: SplayTree<(u64, usize)> = SplayTree::new(); // (full_fp, offset)

    let build_start = std::time::Instant::now();
    let mut rh_r = if num_seeds > 0 { Some(RollingHash::new(r, 0, p)) } else { None };
    for a in 0..num_seeds {
        let fp = if a == 0 {
            rh_r.as_ref().unwrap().value()
        } else {
            let rh = rh_r.as_mut().unwrap();
            rh.roll(r[a - 1], r[a + p - 1]);
            rh.value()
        };
        let f = fp % f_size;
        if f % m != k {
            continue; // not a checkpoint seed
        }
        dbg_build_passed += 1;

        if use_splay {
            // insert_or_get implements first-found policy
            let val = h_r_sp.insert_or_get(fp, (fp, a));
            if val.1 == a {
                dbg_build_stored += 1;
            } else {
                dbg_build_probes += 1;
            }
        } else {
            // Linear probing: find empty slot or existing fp (first-found policy).
            // Uses branch-based wraparound (i += 1; if i == cap { i = 0 }) rather
            // than modulo to avoid division in the hot path.
            let mut i = (f / m) as usize;
            let i0 = i;
            let mut store = true;
            while h_r_ht[i].fp != u64::MAX {
                if h_r_ht[i].fp == fp { store = false; break; } // same fp already stored — skip
                i += 1; if i == cap { i = 0; }
                dbg_build_probes += 1;
                if i == i0 { store = false; break; }             // table full (safety)
            }
            if store {
                h_r_ht[i] = CSlot { fp, offset: a }; // first-found (Section 7 Step 1)
                dbg_build_stored += 1;
            }
        }
    }

    if verbose {
        let passed_pct = if num_seeds > 0 {
            dbg_build_passed as f64 / num_seeds as f64 * 100.0
        } else {
            0.0
        };
        let stored_count = if use_splay { h_r_sp.len() } else { dbg_build_stored };
        let occ_pct = if cap > 0 {
            stored_count as f64 / cap as f64 * 100.0
        } else {
            0.0
        };
        eprintln!(
            "  build: {} seeds, {} passed checkpoint ({:.2}%), \
             {} stored, {} extra probes\n  \
             build: table occupancy {}/{} ({:.1}%), elapsed {:.2}s",
            num_seeds, dbg_build_passed, passed_pct,
            dbg_build_stored, dbg_build_probes,
            stored_count, cap, occ_pct,
            build_start.elapsed().as_secs_f64()
        );
    }

    // Lookup helper — linear probing mirrors the build chain.
    let lookup_r = |h_r_ht: &[CSlot], h_r_sp: &mut SplayTree<(u64, usize)>, fp_v: u64, f_v: u64| -> Option<(u64, usize)> {
        if use_splay {
            h_r_sp.find(fp_v).copied()
        } else {
            let mut i = (f_v / m) as usize;
            let i0 = i;
            while h_r_ht[i].fp != u64::MAX {
                if h_r_ht[i].fp == fp_v { return Some((fp_v, h_r_ht[i].offset)); }
                i += 1; if i == cap { i = 0; }
                if i == i0 { return None; } // full table — not found
            }
            None
        }
    };

    // ── Encoding lookback buffer (Section 5.2) ───────────────────────
    let mut buf: VecDeque<BufEntry> = VecDeque::new();

    let flush_buf = |buf: &mut VecDeque<BufEntry>, commands: &mut Vec<Command>| {
        for entry in buf.drain(..) {
            if !entry.dummy {
                commands.push(entry.cmd);
            }
        }
    };

    // Step (2): initialize scan pointers
    let mut v_c: usize = 0;
    let mut v_s: usize = 0;

    // Rolling hash for O(1) per-position V fingerprinting.
    let v_seeds = if v.len() >= p { v.len() - p + 1 } else { 0 };
    let mut rh_v = if v_seeds > 0 { Some(RollingHash::new(v, 0, p)) } else { None };
    let mut rh_v_pos: usize = 0;

    while v_c + p <= v.len() {
        // Step (3): condition in while header.

        // Step (4): generate footprint at v_c, apply checkpoint test.
        let fp_v = if let Some(ref mut rh) = rh_v {
            if v_c == rh_v_pos {
                rh.value()
            } else if v_c == rh_v_pos + 1 {
                rh.roll(v[v_c - 1], v[v_c + p - 1]);
                rh_v_pos = v_c;
                rh.value()
            } else {
                *rh = RollingHash::new(v, v_c, p);
                rh_v_pos = v_c;
                rh.value()
            }
        } else {
            break;
        };
        let f_v = fp_v % f_size;
        if f_v % m != k {
            v_c += 1;
            continue; // not a checkpoint — skip
        }

        // Checkpoint passed — look up R.
        dbg_scan_checkpoints += 1;

        let entry = lookup_r(&h_r_ht, &mut h_r_sp, fp_v, f_v);

        let r_offset = match entry {
            Some((stored_fp, offset)) if stored_fp == fp_v => {
                // Full fingerprint matches — verify bytes.
                if r[offset..offset + p] != v[v_c..v_c + p] {
                    dbg_scan_byte_mismatch += 1;
                    v_c += 1;
                    continue;
                }
                dbg_scan_match += 1;
                offset
            }
            Some(_) => {
                dbg_scan_fp_mismatch += 1;
                v_c += 1;
                continue;
            }
            None => {
                v_c += 1;
                continue;
            }
        };

        // Step (5): extend match forwards and backwards
        // (Section 7, Step 5; Section 8.2 backward extension, p. 349)
        // Pre-compute max extension, compare slices (one bounds check).
        let max_fwd = (v.len() - v_c).min(r.len() - r_offset);
        let fwd = p + v[v_c + p..v_c + max_fwd]
            .iter()
            .zip(&r[r_offset + p..r_offset + max_fwd])
            .position(|(a, b)| a != b)
            .unwrap_or(max_fwd - p);

        let max_bwd = v_c.min(r_offset);
        let bwd = if max_bwd == 0 {
            0
        } else {
            v[v_c - max_bwd..v_c]
                .iter()
                .rev()
                .zip(r[r_offset - max_bwd..r_offset].iter().rev())
                .position(|(a, b)| a != b)
                .unwrap_or(max_bwd)
        };

        let v_m = v_c - bwd;
        let r_m = r_offset - bwd;
        let ml = bwd + fwd;
        let match_end = v_m + ml;

        // Step (6): encode with correction
        if v_s <= v_m {
            // (6a) match is entirely in unencoded suffix (Section 7)
            if v_s < v_m {
                if buf.len() >= buf_cap {
                    let oldest = buf.pop_front().unwrap();
                    if !oldest.dummy {
                        commands.push(oldest.cmd);
                    }
                }
                buf.push_back(BufEntry {
                    v_start: v_s,
                    v_end: v_m,
                    cmd: Command::Add {
                        data: v[v_s..v_m].to_vec(),
                    },
                    dummy: false,
                });
            }
            if buf.len() >= buf_cap {
                let oldest = buf.pop_front().unwrap();
                if !oldest.dummy {
                    commands.push(oldest.cmd);
                }
            }
            buf.push_back(BufEntry {
                v_start: v_m,
                v_end: match_end,
                cmd: Command::Copy {
                    offset: r_m,
                    length: ml,
                },
                dummy: false,
            });
            v_s = match_end;
        } else {
            // (6b) match extends backward into encoded prefix —
            // tail correction (Section 5.1, p. 339)
            let mut effective_start = v_s;

            while let Some(tail) = buf.back() {
                if tail.dummy {
                    buf.pop_back();
                    continue;
                }

                if tail.v_start >= v_m && tail.v_end <= match_end {
                    // Wholly within new match — absorb
                    effective_start = effective_start.min(tail.v_start);
                    buf.pop_back();
                    continue;
                }

                if tail.v_end > v_m && tail.v_start < v_m {
                    if matches!(tail.cmd, Command::Add { .. }) {
                        // Partial add — trim to [v_start, v_m)
                        let keep = v_m - tail.v_start;
                        if keep > 0 {
                            let back = buf.back_mut().unwrap();
                            back.cmd = Command::Add {
                                data: v[back.v_start..v_m].to_vec(),
                            };
                            back.v_end = v_m;
                        } else {
                            buf.pop_back();
                        }
                        effective_start = effective_start.min(v_m);
                    }
                    // Partial copy — don't reclaim (Section 5.1)
                    break;
                }

                // No overlap with match
                break;
            }

            let adj = effective_start - v_m;
            let new_len = match_end - effective_start;
            if new_len > 0 {
                if buf.len() >= buf_cap {
                    let oldest = buf.pop_front().unwrap();
                    if !oldest.dummy {
                        commands.push(oldest.cmd);
                    }
                }
                buf.push_back(BufEntry {
                    v_start: effective_start,
                    v_end: match_end,
                    cmd: Command::Copy {
                        offset: r_m + adj,
                        length: new_len,
                    },
                    dummy: false,
                });
            }
            v_s = match_end;
        }

        // Step (7): advance past matched region
        v_c = match_end;
    }

    // Step (8): flush buffer and trailing add
    flush_buf(&mut buf, &mut commands);
    if v_s < v.len() {
        commands.push(Command::Add {
            data: v[v_s..].to_vec(),
        });
    }

    if verbose {
        let v_seeds = if v.len() >= p { v.len() - p + 1 } else { 0 };
        let cp_pct = if v_seeds > 0 {
            dbg_scan_checkpoints as f64 / v_seeds as f64 * 100.0
        } else {
            0.0
        };
        let hit_pct = if dbg_scan_checkpoints > 0 {
            dbg_scan_match as f64 / dbg_scan_checkpoints as f64 * 100.0
        } else {
            0.0
        };
        eprintln!(
            "  scan: {} V positions, {} checkpoints ({:.3}%), {} matches\n  \
             scan: hit rate {:.1}% (of checkpoints), \
             fp collisions {}, byte mismatches {}",
            v_seeds, dbg_scan_checkpoints, cp_pct, dbg_scan_match,
            hit_pct, dbg_scan_fp_mismatch, dbg_scan_byte_mismatch
        );
        super::print_command_stats(&commands);
    }

    commands
}

/// Convenience wrapper with default parameters.
pub fn diff_correcting_default(r: &[u8], v: &[u8]) -> Vec<Command> {
    diff_correcting(r, v, &DiffOptions::default())
}