1use std::collections::{HashMap, VecDeque};
5
6use heddle_format::{compression::CompressionConfig, delta::DeltaEncoder};
7
8use super::{
9 ObjectType, PackObjectId, PackObjectRecord, PackStats, append_container_checksum,
10 compress_pack_payload, encode_tagged_entry, encode_tagged_entry_parts, pack_container_spec,
11 pack_index::PackIndex, write_container_header,
12};
13use crate::{object::ContentHash, store::Result};
14
15const MIN_DELTA_SIZE: usize = 64;
16const MAX_DELTA_CHAIN_DEPTH: usize = 50;
18const WINDOW_SIZE: usize = 10;
20
21type GroupedPackMap = HashMap<ObjectType, Vec<PackObjectRecord>>;
22
23pub struct PackBuilder {
25 objects: Vec<PackObjectRecord>,
26 compression: CompressionConfig,
27}
28
29struct WindowEntry {
31 hash: ContentHash,
32 data: Vec<u8>,
33 index: HashMap<[u8; 4], Vec<usize>>,
34 chain_depth: usize,
35}
36
37impl PackBuilder {
38 pub fn new(compression: CompressionConfig) -> Self {
40 Self {
41 objects: Vec::new(),
42 compression,
43 }
44 }
45
46 pub fn add(&mut self, hash: ContentHash, obj_type: ObjectType, data: Vec<u8>) {
48 self.add_id(PackObjectId::Hash(hash), obj_type, data);
49 }
50
51 pub fn add_id(&mut self, id: PackObjectId, obj_type: ObjectType, data: Vec<u8>) {
52 self.objects.push(PackObjectRecord {
53 id,
54 obj_type,
55 data,
56 delta_base: None,
57 path_hint: None,
58 });
59 }
60
61 pub fn add_with_path(
67 &mut self,
68 hash: ContentHash,
69 obj_type: ObjectType,
70 data: Vec<u8>,
71 path: Option<String>,
72 ) {
73 self.add_with_path_id(PackObjectId::Hash(hash), obj_type, data, path);
74 }
75
76 pub fn add_with_path_id(
77 &mut self,
78 id: PackObjectId,
79 obj_type: ObjectType,
80 data: Vec<u8>,
81 path: Option<String>,
82 ) {
83 self.objects.push(PackObjectRecord {
84 id,
85 obj_type,
86 data,
87 delta_base: None,
88 path_hint: path,
89 });
90 }
91
92 pub fn build(self) -> Result<(Vec<u8>, Vec<u8>, PackStats)> {
96 let mut pack_data = Vec::new();
97 let mut index = PackIndex::new();
98
99 write_container_header(
100 &mut pack_data,
101 pack_container_spec(),
102 self.objects.len() as u64,
103 );
104
105 let mut total_uncompressed = 0u64;
106 let mut total_compressed = 0u64;
107 let mut delta_count = 0u64;
108
109 let object_count = self.objects.len() as u64;
110 let grouped = Self::group_by_type(self.objects);
111
112 for (obj_type, mut objects) in grouped {
113 if objects.len() < 2
114 || obj_type == ObjectType::State
115 || self.compression.max_delta_size == 0
116 {
117 for record in objects {
120 let offset = pack_data.len() as u64;
121 index.add(record.id, offset);
122
123 total_uncompressed += record.data.len() as u64;
124 let compressed = compress_pack_payload(&record.data, &self.compression)?;
125 total_compressed += compressed.len() as u64;
126
127 Self::write_entry(&mut pack_data, &record, obj_type, &compressed)?;
128 }
129 } else {
130 Self::sort_for_delta_window(&mut objects);
131 Self::encode_with_sliding_window(
132 &mut pack_data,
133 &mut index,
134 &mut total_uncompressed,
135 &mut total_compressed,
136 &mut delta_count,
137 obj_type,
138 objects,
139 &self.compression,
140 )?;
141 }
142 }
143
144 index.sort();
145
146 append_container_checksum(&mut pack_data);
147
148 let stats = PackStats {
149 object_count,
150 total_uncompressed,
151 total_compressed,
152 delta_count,
153 compression_ratio: total_compressed as f64 / total_uncompressed as f64,
154 };
155
156 Ok((pack_data, index.to_bytes(), stats))
157 }
158
159 fn sort_for_delta_window(objects: &mut [PackObjectRecord]) {
166 objects.sort_by(|a, b| {
167 let key_a = Self::sort_key(&a.path_hint);
168 let key_b = Self::sort_key(&b.path_hint);
169 key_a.cmp(&key_b).then(b.data.len().cmp(&a.data.len()))
170 });
171 }
172
173 fn sort_key(path: &Option<String>) -> (String, String) {
176 match path {
177 Some(p) => {
178 let filename = p.rsplit('/').next().unwrap_or(p);
179 if let Some(dot_pos) = filename.rfind('.') {
180 let ext = filename[dot_pos + 1..].to_string();
181 let stem = filename[..dot_pos].to_string();
182 (ext, stem)
183 } else {
184 (String::new(), filename.to_string())
185 }
186 }
187 None => ("\u{FFFF}".to_string(), String::new()),
188 }
189 }
190
191 #[allow(clippy::too_many_arguments)]
197 fn encode_with_sliding_window(
198 pack_data: &mut Vec<u8>,
199 index: &mut PackIndex,
200 total_uncompressed: &mut u64,
201 total_compressed: &mut u64,
202 delta_count: &mut u64,
203 obj_type: ObjectType,
204 objects: Vec<PackObjectRecord>,
205 compression: &CompressionConfig,
206 ) -> Result<()> {
207 let mut window: VecDeque<WindowEntry> = VecDeque::with_capacity(WINDOW_SIZE);
208
209 for record in objects {
210 let hash = match record.id {
211 PackObjectId::Hash(hash) => hash,
212 PackObjectId::ChangeId(_) => {
213 let offset = pack_data.len() as u64;
214 index.add(record.id, offset);
215 *total_uncompressed += record.data.len() as u64;
216 let compressed = compress_pack_payload(&record.data, compression)?;
217 *total_compressed += compressed.len() as u64;
218 Self::write_entry(pack_data, &record, obj_type, &compressed)?;
219 continue;
220 }
221 };
222 let data = record.data;
223 let offset = pack_data.len() as u64;
224 index.add(PackObjectId::Hash(hash), offset);
225 *total_uncompressed += data.len() as u64;
226
227 let mut best_base_idx: Option<usize> = None;
229 let mut best_delta_estimate = usize::MAX;
230
231 if data.len() >= MIN_DELTA_SIZE {
232 for (i, entry) in window.iter().enumerate() {
233 if entry.chain_depth >= MAX_DELTA_CHAIN_DEPTH {
235 continue;
236 }
237 if entry.data.len() < MIN_DELTA_SIZE {
239 continue;
240 }
241
242 let estimate = DeltaEncoder::estimate_delta_size_with_index(
243 &entry.index,
244 &entry.data,
245 &data,
246 );
247
248 if estimate < best_delta_estimate {
249 best_delta_estimate = estimate;
250 best_base_idx = Some(i);
251 }
252 }
253 }
254
255 let (final_data, entry_type, base_hash, chain_depth) =
257 if let Some(base_idx) = best_base_idx {
258 let base_entry = &window[base_idx];
259 let delta =
260 DeltaEncoder::encode_with_index(&base_entry.index, &base_entry.data, &data);
261 let delta_compressed = compress_pack_payload(&delta, compression)?;
262
263 if delta_compressed.len() < data.len() {
264 *delta_count += 1;
265 let bh = base_entry.hash;
266 let depth = base_entry.chain_depth + 1;
267 (delta_compressed, ObjectType::Delta, Some(bh), depth)
268 } else {
269 let compressed = compress_pack_payload(&data, compression)?;
270 (compressed, obj_type, None, 0)
271 }
272 } else {
273 let compressed = compress_pack_payload(&data, compression)?;
274 (compressed, obj_type, None, 0)
275 };
276
277 *total_compressed += final_data.len() as u64;
278
279 Self::write_entry_parts(
280 pack_data,
281 PackObjectId::Hash(hash),
282 entry_type,
283 data.len(),
284 base_hash.map(PackObjectId::Hash),
285 &final_data,
286 )?;
287
288 let entry_index = DeltaEncoder::build_index(&data);
290 if window.len() >= WINDOW_SIZE {
291 window.pop_front();
292 }
293 window.push_back(WindowEntry {
294 hash,
295 data,
296 index: entry_index,
297 chain_depth,
298 });
299 }
300
301 Ok(())
302 }
303
304 fn group_by_type(objects: Vec<PackObjectRecord>) -> GroupedPackMap {
305 let mut groups: GroupedPackMap = HashMap::new();
306
307 for record in objects {
308 groups.entry(record.obj_type).or_default().push(record);
309 }
310
311 groups
312 }
313
314 fn write_entry(
325 pack: &mut Vec<u8>,
326 record: &PackObjectRecord,
327 obj_type: ObjectType,
328 compressed: &[u8],
329 ) -> Result<()> {
330 encode_tagged_entry(pack, record, obj_type, compressed)
331 }
332
333 fn write_entry_parts(
334 pack: &mut Vec<u8>,
335 id: PackObjectId,
336 obj_type: ObjectType,
337 uncompressed_size: usize,
338 delta_base: Option<PackObjectId>,
339 compressed: &[u8],
340 ) -> Result<()> {
341 encode_tagged_entry_parts(
342 pack,
343 id,
344 obj_type,
345 uncompressed_size,
346 delta_base,
347 compressed,
348 )
349 }
350}