Skip to main content

objects/delta/
delta_encoder.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Delta encoder using Git-style compact copy instructions.
3//!
4//! Copy instruction format (identical to Git):
5//! ```text
6//! Byte 0: 1oooosss
7//!   o bits (3-6): which offset bytes follow (up to 4 → 32-bit offset)
8//!   s bits (0-2): which size bytes follow (up to 3 → 24-bit size; all zero = 0x10000)
9//! [offset bytes, low to high, only present if corresponding o-bit is set]
10//! [size bytes, low to high, only present if corresponding s-bit is set]
11//! ```
12//!
13//! Insert instruction: `[length-1] [literal bytes]` (max 127 bytes per chunk).
14
15use std::collections::HashMap;
16
17/// Minimum match length for targets >= 1024 bytes.
18const MIN_MATCH_LENGTH_LARGE: usize = 16;
19/// Minimum match length for small targets (< 1024 bytes).
20const MIN_MATCH_LENGTH_SMALL: usize = 8;
21
22/// Delta encoder.
23#[derive(Debug)]
24pub struct DeltaEncoder;
25
26impl DeltaEncoder {
27    /// Create a new delta encoder.
28    pub fn new() -> Self {
29        Self
30    }
31
32    /// Encode a delta from base to target.
33    pub fn encode(base: &[u8], target: &[u8]) -> Vec<u8> {
34        if base.is_empty() {
35            return Self::encode_insert(target);
36        }
37
38        let index = Self::build_index(base);
39        Self::encode_with_index(&index, base, target)
40    }
41
42    /// Encode a delta using a pre-built index (avoids rebuilding for sliding window).
43    pub fn encode_with_index(
44        index: &HashMap<[u8; 4], Vec<usize>>,
45        base: &[u8],
46        target: &[u8],
47    ) -> Vec<u8> {
48        if base.is_empty() {
49            return Self::encode_insert(target);
50        }
51
52        let min_match = Self::min_match_for(target.len());
53        let mut delta = Vec::new();
54        let mut pos = 0;
55
56        while pos < target.len() {
57            if let Some((offset, length)) =
58                Self::find_best_match(index, base, target, pos, min_match)
59            {
60                Self::emit_copy(&mut delta, offset, length);
61                pos += length;
62            } else {
63                let start = pos;
64                while pos < target.len() && pos - start < 127 {
65                    if Self::find_best_match(index, base, target, pos, min_match).is_some() {
66                        break;
67                    }
68                    pos += 1;
69                }
70
71                let len = pos - start;
72                delta.push(len as u8 - 1);
73                delta.extend_from_slice(&target[start..pos]);
74            }
75        }
76
77        delta
78    }
79
80    /// Estimate the encoded delta size without allocating the output.
81    pub fn estimate_delta_size(base: &[u8], target: &[u8]) -> usize {
82        if base.is_empty() {
83            return target.len() + target.len().div_ceil(128);
84        }
85
86        let index = Self::build_index(base);
87        Self::estimate_delta_size_with_index(&index, base, target)
88    }
89
90    /// Estimate delta size using a pre-built index (avoids rebuilding for sliding window).
91    pub fn estimate_delta_size_with_index(
92        index: &HashMap<[u8; 4], Vec<usize>>,
93        base: &[u8],
94        target: &[u8],
95    ) -> usize {
96        if base.is_empty() {
97            return target.len() + target.len().div_ceil(128);
98        }
99
100        let min_match = Self::min_match_for(target.len());
101        let mut size = 0usize;
102        let mut pos = 0;
103
104        while pos < target.len() {
105            if let Some((offset, length)) =
106                Self::find_best_match(index, base, target, pos, min_match)
107            {
108                size += Self::copy_instruction_size(offset, length);
109                pos += length;
110            } else {
111                let start = pos;
112                while pos < target.len() && pos - start < 127 {
113                    if Self::find_best_match(index, base, target, pos, min_match).is_some() {
114                        break;
115                    }
116                    pos += 1;
117                }
118                size += 1 + (pos - start);
119            }
120        }
121
122        size
123    }
124
125    /// Build a 4-byte hash index over the base data.
126    pub fn build_index(base: &[u8]) -> HashMap<[u8; 4], Vec<usize>> {
127        let mut index: HashMap<[u8; 4], Vec<usize>> = HashMap::new();
128
129        for i in 0..base.len().saturating_sub(4) {
130            let key = [base[i], base[i + 1], base[i + 2], base[i + 3]];
131            index.entry(key).or_default().push(i);
132        }
133
134        index
135    }
136
137    /// Emit a Git-style copy instruction.
138    ///
139    /// Format: `1sssoooo [offset bytes] [size bytes]`
140    /// - Bit 7: copy flag (always 1)
141    /// - Bits 0-3 (o): which offset bytes (0-3) are present
142    /// - Bits 4-6 (s): which size bytes (0-2) are present
143    /// - If no s bits set, size = 0x10000
144    fn emit_copy(delta: &mut Vec<u8>, offset: usize, length: usize) {
145        let mut cmd: u8 = 0x80;
146        let offset = offset as u32;
147        let length = length as u32;
148
149        // Offset byte flags: bits 0-3
150        // Always emit at least offset byte 0 to avoid the reserved cmd=0x80
151        // (which occurs when offset=0 and length=0x10000).
152        cmd |= 0x01; // always include offset byte 0
153        if offset & 0xFF00 != 0 {
154            cmd |= 0x02;
155        }
156        if offset & 0xFF_0000 != 0 {
157            cmd |= 0x04;
158        }
159        if offset & 0xFF00_0000 != 0 {
160            cmd |= 0x08;
161        }
162
163        // Size byte flags: bits 4-6
164        // Special case: size == 0x10000 is encoded as no size bytes (all s bits zero)
165        if length != 0x10000 {
166            if length & 0xFF != 0 {
167                cmd |= 0x10;
168            }
169            if length & 0xFF00 != 0 {
170                cmd |= 0x20;
171            }
172            if length & 0xFF_0000 != 0 {
173                cmd |= 0x40;
174            }
175        }
176
177        delta.push(cmd);
178
179        // Emit offset bytes (low to high), only those flagged
180        delta.push(offset as u8); // always present (bit 0 always set)
181        if offset & 0xFF00 != 0 {
182            delta.push((offset >> 8) as u8);
183        }
184        if offset & 0xFF_0000 != 0 {
185            delta.push((offset >> 16) as u8);
186        }
187        if offset & 0xFF00_0000 != 0 {
188            delta.push((offset >> 24) as u8);
189        }
190
191        // Emit size bytes (low to high), only those flagged
192        if length != 0x10000 {
193            if length & 0xFF != 0 {
194                delta.push(length as u8);
195            }
196            if length & 0xFF00 != 0 {
197                delta.push((length >> 8) as u8);
198            }
199            if length & 0xFF_0000 != 0 {
200                delta.push((length >> 16) as u8);
201            }
202        }
203    }
204
205    /// Calculate the byte size of a Git-style copy instruction.
206    fn copy_instruction_size(offset: usize, length: usize) -> usize {
207        let offset = offset as u32;
208        let length = length as u32;
209        let mut n = 1 + 1; // flag byte + offset byte 0 (always present)
210
211        // Additional offset bytes (bits 1-3)
212        if offset & 0xFF00 != 0 {
213            n += 1;
214        }
215        if offset & 0xFF_0000 != 0 {
216            n += 1;
217        }
218        if offset & 0xFF00_0000 != 0 {
219            n += 1;
220        }
221
222        // Size bytes (bits 4-6); 0x10000 = no bytes
223        if length != 0x10000 {
224            if length & 0xFF != 0 {
225                n += 1;
226            }
227            if length & 0xFF00 != 0 {
228                n += 1;
229            }
230            if length & 0xFF_0000 != 0 {
231                n += 1;
232            }
233        }
234
235        n
236    }
237
238    /// Choose minimum match length based on target size.
239    fn min_match_for(target_len: usize) -> usize {
240        if target_len < 1024 {
241            MIN_MATCH_LENGTH_SMALL
242        } else {
243            MIN_MATCH_LENGTH_LARGE
244        }
245    }
246
247    fn encode_insert(data: &[u8]) -> Vec<u8> {
248        let mut delta = Vec::new();
249        for chunk in data.chunks(128) {
250            delta.push((chunk.len() - 1) as u8);
251            delta.extend_from_slice(chunk);
252        }
253        delta
254    }
255
256    fn find_best_match(
257        index: &HashMap<[u8; 4], Vec<usize>>,
258        base: &[u8],
259        target: &[u8],
260        pos: usize,
261        min_match: usize,
262    ) -> Option<(usize, usize)> {
263        if pos + 4 > target.len() {
264            return None;
265        }
266
267        let key = [
268            target[pos],
269            target[pos + 1],
270            target[pos + 2],
271            target[pos + 3],
272        ];
273        let offsets = index.get(&key)?;
274
275        let mut best_offset = 0;
276        let mut best_length = 0;
277
278        for &offset in offsets {
279            let length = Self::match_length(base, offset, target, pos);
280            if length > best_length {
281                best_length = length;
282                best_offset = offset;
283            }
284        }
285
286        if best_length >= min_match {
287            Some((best_offset, best_length))
288        } else {
289            None
290        }
291    }
292
293    fn match_length(base: &[u8], base_pos: usize, target: &[u8], target_pos: usize) -> usize {
294        let max_len = (base.len() - base_pos).min(target.len() - target_pos);
295        let mut len = 0;
296        while len < max_len && base[base_pos + len] == target[target_pos + len] {
297            len += 1;
298        }
299        len
300    }
301}
302
303impl Default for DeltaEncoder {
304    fn default() -> Self {
305        Self::new()
306    }
307}