Skip to main content

speck_core/delta/
builder.rs

1//! Delta construction using suffix array for optimal matches
2
3use alloc::vec::Vec;
4use crate::delta::{Delta, Op, MAX_HUNK_SIZE};
5use crate::error::Result;
6
7/// Builder for creating optimized binary deltas
8#[derive(Debug)]
9pub struct DeltaBuilder {
10    min_match_len: usize,
11    max_search_depth: usize,
12}
13
14impl Default for DeltaBuilder {
15    fn default() -> Self {
16        Self {
17            min_match_len: 8,
18            max_search_depth: 100,
19        }
20    }
21}
22
23impl DeltaBuilder {
24    /// Create new builder with default settings
25    pub fn new() -> Self {
26        Self::default()
27    }
28    
29    /// Set minimum match length (smaller = better compression, slower)
30    pub fn min_match_len(mut self, len: usize) -> Self {
31        self.min_match_len = len.max(4);
32        self
33    }
34    
35    /// Build delta from old to new
36    pub fn build(&self, old: &[u8], new: &[u8]) -> Result<Delta> {
37        if old.is_empty() {
38            // Pure insert
39            return Ok(Delta {
40                source_size: 0,
41                target_size: new.len() as u64,
42                ops: vec![Op::Insert { data: new.to_vec() }],
43            });
44        }
45        
46        let mut ops = Vec::new();
47        let mut new_pos: usize = 0;
48        
49        // Build suffix array for old data
50        let sa = SuffixArray::new(old);
51        
52        while new_pos < new.len() {
53            // Find best match
54            let best = self.find_best_match(&sa, old, &new[new_pos..], new_pos);
55            
56            if let Some((src_off, match_len)) = best {
57                if match_len >= self.min_match_len {
58                    // Emit copy
59                    ops.push(Op::Copy {
60                        src_offset: src_off as u64,
61                        length: match_len as u64,
62                    });
63                    new_pos += match_len;
64                    continue;
65                }
66            }
67            
68            // No good match, gather literals
69            let literal_start = new_pos;
70            let mut literal_len = 1;
71            
72            while literal_len < MAX_HUNK_SIZE 
73                && new_pos + literal_len < new.len() {
74                // Check if starting a match would be better
75                if literal_len >= self.min_match_len {
76                    let lookahead = &new[new_pos + literal_len..];
77                    if self.find_best_match(&sa, old, lookahead, new_pos + literal_len)
78                        .map_or(false, |(_, len)| len >= self.min_match_len) {
79                        break;
80                    }
81                }
82                literal_len += 1;
83            }
84            
85            ops.push(Op::Insert {
86                data: new[literal_start..literal_start + literal_len].to_vec(),
87            });
88            new_pos += literal_len;
89        }
90        
91        // Coalesce adjacent copies
92        let ops = self.coalesce_copies(ops);
93        
94        Ok(Delta {
95            source_size: old.len() as u64,
96            target_size: new.len() as u64,
97            ops,
98        })
99    }
100    
101    fn find_best_match(&self, sa: &SuffixArray, old: &[u8], new_suffix: &[u8], _new_pos: usize) -> Option<(usize, usize)> {
102        if new_suffix.len() < self.min_match_len {
103            return None;
104        }
105        
106        // Binary search for best prefix match
107        let mut best_len = 0;
108        let mut best_pos = 0;
109        
110        // Simple approach: sample positions
111        let step = (old.len() / self.max_search_depth).max(1);
112        for i in (0..old.len()).step_by(step) {
113            let len = common_prefix_length(&old[i..], new_suffix);
114            if len > best_len && len >= self.min_match_len {
115                best_len = len;
116                best_pos = i;
117                if best_len == new_suffix.len() {
118                    break;
119                }
120            }
121        }
122        
123        if best_len >= self.min_match_len {
124            Some((best_pos, best_len))
125        } else {
126            None
127        }
128    }
129    
130    fn coalesce_copies(&self, ops: Vec<Op>) -> Vec<Op> {
131        let mut result = Vec::with_capacity(ops.len());
132
133        for op in ops {
134            if let Some(last) = result.last_mut() {
135                match (last, &op) {
136                    (Op::Copy { src_offset, length }, Op::Copy {
137                        src_offset: next_src,
138                        length: next_len,
139                    }) => {
140                        // Check if adjacent in both source and destination
141                        if *src_offset + *length == *next_src {
142                            *length += *next_len;
143                            continue;
144                        }
145                    }
146                    (Op::Insert { data }, Op::Insert { data: next_data }) => {
147                        if data.len() + next_data.len() <= MAX_HUNK_SIZE {
148                            data.extend_from_slice(next_data);
149                            continue;
150                        }
151                    }
152                    _ => {}
153                }
154            }
155            result.push(op);
156        }
157
158        result
159    }
160}
161
162/// Simple suffix array implementation for substring search
163struct SuffixArray<'a> {
164    text: &'a [u8],
165    sa: Vec<usize>,
166}
167
168impl<'a> SuffixArray<'a> {
169    fn new(text: &'a [u8]) -> Self {
170        let mut sa: Vec<usize> = (0..text.len()).collect();
171        sa.sort_by(|&a, &b| text[a..].cmp(&text[b..]));
172        Self { text, sa }
173    }
174}
175
176fn common_prefix_length(a: &[u8], b: &[u8]) -> usize {
177    a.iter().zip(b.iter()).take_while(|(x, y)| x == y).count()
178}