Skip to main content

delta/algorithm/
onepass.rs

1use crate::hash::{next_prime, RollingHash};
2use crate::splay::SplayTree;
3use crate::types::{Command, DiffOptions};
4
5/// Flat hash-table slot — sentinel-based (no Option overhead).
6/// Empty/stale slots have version == u64::MAX (valid versions start at 0).
7#[derive(Clone, Copy)]
8struct Slot {
9    fp: u64,
10    offset: usize,
11    version: u64,
12}
13
14const EMPTY_SLOT: Slot = Slot {
15    fp: 0,
16    offset: 0,
17    version: u64::MAX,
18};
19
20/// One-Pass algorithm (Section 4.1, Figure 3).
21///
22/// Scans R and V concurrently with two hash tables (one per string).
23/// Each table stores at most one offset per footprint (retain-existing
24/// policy: first entry wins, later collisions are discarded).
25/// Hash tables are logically flushed after each match via version counter
26/// (next-match policy).
27/// Time: O(np + q), space: O(q) — both constant for fixed p, q (Section 4.2).
28/// Suboptimal on transpositions: cannot match blocks that appear in
29/// different order between R and V (Section 4.3).
30///
31/// The hash table is auto-sized to max(q, num_seeds / p) so that large
32/// inputs get one slot per seed-length chunk of R.  TABLE_SIZE acts as a
33/// floor for small files.
34pub fn diff_onepass(r: &[u8], v: &[u8], opts: &DiffOptions) -> Vec<Command> {
35    let p = opts.p;
36    let q = opts.q;
37    let verbose = opts.verbose;
38    let use_splay = opts.use_splay;
39
40    let mut commands = Vec::new();
41    if v.is_empty() {
42        return commands;
43    }
44
45    // Auto-size hash table: one slot per p-byte chunk of R (floor = q).
46    let num_seeds = if r.len() >= p { r.len() - p + 1 } else { 0 };
47    let q = next_prime(q.max(num_seeds / p));
48
49    if verbose {
50        eprintln!(
51            "onepass: {}, q={}, |R|={}, |V|={}, seed_len={}",
52            if use_splay { "splay tree" } else { "hash table" },
53            q, r.len(), v.len(), p
54        );
55    }
56
57    // Step (1): lookup structures with version-based logical flushing.
58    // Flat slot array — sentinel version u64::MAX marks empty/stale slots.
59    let mut h_v_ht: Vec<Slot> = if !use_splay { vec![EMPTY_SLOT; q] } else { Vec::new() };
60    let mut h_r_ht: Vec<Slot> = if !use_splay { vec![EMPTY_SLOT; q] } else { Vec::new() };
61
62    // Splay tree path: value is (offset, version)
63    let mut h_v_sp: SplayTree<(usize, u64)> = SplayTree::new();
64    let mut h_r_sp: SplayTree<(usize, u64)> = SplayTree::new();
65
66    let mut ver: u64 = 0;
67
68    // Debug counters (verbose mode only)
69    let mut dbg_positions: usize = 0;
70    let mut dbg_lookups: usize = 0;
71    let mut dbg_matches: usize = 0;
72
73    // Step (2): initialize scan pointers
74    let mut r_c: usize = 0;
75    let mut v_c: usize = 0;
76    let mut v_s: usize = 0;
77
78    // Rolling hashes for O(1) per-position fingerprinting.
79    // Initialized lazily on first use and reinitialized after match jumps.
80    let mut rh_v: Option<RollingHash> = if v.len() >= p { Some(RollingHash::new(v, 0, p)) } else { None };
81    let mut rh_r: Option<RollingHash> = if r.len() >= p { Some(RollingHash::new(r, 0, p)) } else { None };
82    let mut rh_v_pos: usize = 0; // position rh_v currently represents
83    let mut rh_r_pos: usize = 0;
84
85    loop {
86        // Step (3): check for end of V and R
87        let can_v = v_c + p <= v.len();
88        let can_r = r_c + p <= r.len();
89
90        if !can_v && !can_r {
91            break;
92        }
93        dbg_positions += 1;
94
95        let fp_v = if can_v {
96            if let Some(ref mut rh) = rh_v {
97                if v_c == rh_v_pos {
98                    // Already at the right position
99                } else if v_c == rh_v_pos + 1 {
100                    rh.roll(v[v_c - 1], v[v_c + p - 1]);
101                    rh_v_pos = v_c;
102                } else {
103                    // Jump — reinitialize
104                    *rh = RollingHash::new(v, v_c, p);
105                    rh_v_pos = v_c;
106                }
107                Some(rh.value())
108            } else {
109                None
110            }
111        } else {
112            None
113        };
114        let fp_r = if can_r {
115            if let Some(ref mut rh) = rh_r {
116                if r_c == rh_r_pos {
117                    // Already at the right position
118                } else if r_c == rh_r_pos + 1 {
119                    rh.roll(r[r_c - 1], r[r_c + p - 1]);
120                    rh_r_pos = r_c;
121                } else {
122                    *rh = RollingHash::new(r, r_c, p);
123                    rh_r_pos = r_c;
124                }
125                Some(rh.value())
126            } else {
127                None
128            }
129        } else {
130            None
131        };
132
133        // Step (4a): store offsets (retain-existing policy)
134        if let Some(fp) = fp_v {
135            if use_splay {
136                let existing = h_v_sp.find(fp);
137                if existing.is_none() || existing.unwrap().1 != ver {
138                    h_v_sp.insert(fp, (v_c, ver));
139                }
140            } else {
141                let idx = (fp % q as u64) as usize;
142                let slot = &mut h_v_ht[idx];
143                if slot.version != ver {
144                    *slot = Slot { fp, offset: v_c, version: ver };
145                }
146            }
147        }
148        if let Some(fp) = fp_r {
149            if use_splay {
150                let existing = h_r_sp.find(fp);
151                if existing.is_none() || existing.unwrap().1 != ver {
152                    h_r_sp.insert(fp, (r_c, ver));
153                }
154            } else {
155                let idx = (fp % q as u64) as usize;
156                let slot = &mut h_r_ht[idx];
157                if slot.version != ver {
158                    *slot = Slot { fp, offset: r_c, version: ver };
159                }
160            }
161        }
162
163        // Step (4b): look for a matching seed in the other table
164        let mut match_found = false;
165        let mut r_m: usize = 0;
166        let mut v_m: usize = 0;
167
168        if let Some(fp) = fp_r {
169            let v_cand = if use_splay {
170                h_v_sp.find(fp).and_then(|&mut (off, v)| if v == ver { Some(off) } else { None })
171            } else {
172                let idx = (fp % q as u64) as usize;
173                let slot = &h_v_ht[idx];
174                if slot.version == ver && slot.fp == fp { Some(slot.offset) } else { None }
175            };
176            if let Some(v_cand) = v_cand {
177                dbg_lookups += 1;
178                if r[r_c..r_c + p] == v[v_cand..v_cand + p] {
179                    r_m = r_c;
180                    v_m = v_cand;
181                    match_found = true;
182                }
183            }
184        }
185
186        if !match_found {
187            if let Some(fp) = fp_v {
188                let r_cand = if use_splay {
189                    h_r_sp.find(fp).and_then(|&mut (off, v)| if v == ver { Some(off) } else { None })
190                } else {
191                    let idx = (fp % q as u64) as usize;
192                    let slot = &h_r_ht[idx];
193                    if slot.version == ver && slot.fp == fp { Some(slot.offset) } else { None }
194                };
195                if let Some(r_cand) = r_cand {
196                    dbg_lookups += 1;
197                    if v[v_c..v_c + p] == r[r_cand..r_cand + p] {
198                        v_m = v_c;
199                        r_m = r_cand;
200                        match_found = true;
201                    }
202                }
203            }
204        }
205
206        if !match_found {
207            v_c += 1;
208            r_c += 1;
209            continue;
210        }
211        dbg_matches += 1;
212
213        // Step (5): extend match forward
214        // Pre-compute max extension, then compare slices (one bounds check
215        // instead of per-byte).
216        let max_ext = (v.len() - v_m).min(r.len() - r_m);
217        let ml = v[v_m..v_m + max_ext]
218            .iter()
219            .zip(&r[r_m..r_m + max_ext])
220            .position(|(a, b)| a != b)
221            .unwrap_or(max_ext);
222
223        // Step (6): encode
224        if v_s < v_m {
225            commands.push(Command::Add {
226                data: v[v_s..v_m].to_vec(),
227            });
228        }
229        commands.push(Command::Copy {
230            offset: r_m,
231            length: ml,
232        });
233        v_s = v_m + ml;
234
235        // Step (7): advance pointers and flush tables
236        v_c = v_m + ml;
237        r_c = r_m + ml;
238        ver += 1;
239    }
240
241    // Step (8): trailing add
242    if v_s < v.len() {
243        commands.push(Command::Add {
244            data: v[v_s..].to_vec(),
245        });
246    }
247
248    if verbose {
249        let hit_pct = if dbg_lookups > 0 { dbg_matches as f64 / dbg_lookups as f64 * 100.0 } else { 0.0 };
250        eprintln!(
251            "  scan: {} positions, {} lookups, {} matches (flushes)\n  \
252             scan: hit rate {:.1}% (of lookups)",
253            dbg_positions, dbg_lookups, dbg_matches, hit_pct
254        );
255        super::print_command_stats(&commands);
256    }
257
258    commands
259}
260
261/// Convenience wrapper with default parameters.
262pub fn diff_onepass_default(r: &[u8], v: &[u8]) -> Vec<Command> {
263    diff_onepass(r, v, &DiffOptions::default())
264}