Skip to main content

objects/store/pack/
pack_builder.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Pack builder for creating packfiles.
3
4use std::collections::{HashMap, VecDeque};
5
6use heddle_format::{compression::CompressionConfig, delta::DeltaEncoder};
7
8use super::{
9    ObjectType, PackObjectId, PackObjectRecord, PackStats, append_container_checksum,
10    compress_pack_payload, encode_tagged_entry, encode_tagged_entry_parts, pack_container_spec,
11    pack_index::PackIndex, write_container_header,
12};
13use crate::{object::ContentHash, store::Result};
14
15const MIN_DELTA_SIZE: usize = 64;
16/// Maximum depth for delta chains (matches Git's default).
17const MAX_DELTA_CHAIN_DEPTH: usize = 50;
18/// Number of recent objects to try as delta bases (Git default: 10).
19const WINDOW_SIZE: usize = 10;
20
21type GroupedPackMap = HashMap<ObjectType, Vec<PackObjectRecord>>;
22
23/// Pack builder for creating packfiles.
24pub struct PackBuilder {
25    objects: Vec<PackObjectRecord>,
26    compression: CompressionConfig,
27}
28
29/// A recent object in the sliding window, with cached hash index for fast delta estimation.
30struct WindowEntry {
31    hash: ContentHash,
32    data: Vec<u8>,
33    index: HashMap<[u8; 4], Vec<usize>>,
34    chain_depth: usize,
35}
36
37impl PackBuilder {
38    /// Create a new pack builder.
39    pub fn new(compression: CompressionConfig) -> Self {
40        Self {
41            objects: Vec::new(),
42            compression,
43        }
44    }
45
46    /// Add an object to the pack.
47    pub fn add(&mut self, hash: ContentHash, obj_type: ObjectType, data: Vec<u8>) {
48        self.add_id(PackObjectId::Hash(hash), obj_type, data);
49    }
50
51    pub fn add_id(&mut self, id: PackObjectId, obj_type: ObjectType, data: Vec<u8>) {
52        self.objects.push(PackObjectRecord {
53            id,
54            obj_type,
55            data,
56            delta_base: None,
57            path_hint: None,
58        });
59    }
60
61    /// Add an object with a path hint for better delta grouping.
62    ///
63    /// Objects sharing the same path (e.g. successive versions of `src/main.rs`)
64    /// will be sorted together for delta encoding, producing much better
65    /// compression ratios than size-only ordering.
66    pub fn add_with_path(
67        &mut self,
68        hash: ContentHash,
69        obj_type: ObjectType,
70        data: Vec<u8>,
71        path: Option<String>,
72    ) {
73        self.add_with_path_id(PackObjectId::Hash(hash), obj_type, data, path);
74    }
75
76    pub fn add_with_path_id(
77        &mut self,
78        id: PackObjectId,
79        obj_type: ObjectType,
80        data: Vec<u8>,
81        path: Option<String>,
82    ) {
83        self.objects.push(PackObjectRecord {
84            id,
85            obj_type,
86            data,
87            delta_base: None,
88            path_hint: path,
89        });
90    }
91
92    /// Build the packfile and index.
93    ///
94    /// Returns the pack data, index data, and statistics.
95    pub fn build(self) -> Result<(Vec<u8>, Vec<u8>, PackStats)> {
96        let mut pack_data = Vec::new();
97        let mut index = PackIndex::new();
98
99        write_container_header(
100            &mut pack_data,
101            pack_container_spec(),
102            self.objects.len() as u64,
103        );
104
105        let mut total_uncompressed = 0u64;
106        let mut total_compressed = 0u64;
107        let mut delta_count = 0u64;
108
109        let object_count = self.objects.len() as u64;
110        let grouped = Self::group_by_type(self.objects);
111
112        for (obj_type, mut objects) in grouped {
113            if objects.len() < 2
114                || obj_type == ObjectType::State
115                || self.compression.max_delta_size == 0
116            {
117                // Single objects, states, or transfer-tuned packs with delta disabled:
118                // write entries directly without running the sliding delta search.
119                for record in objects {
120                    let offset = pack_data.len() as u64;
121                    index.add(record.id, offset);
122
123                    total_uncompressed += record.data.len() as u64;
124                    let compressed = compress_pack_payload(&record.data, &self.compression)?;
125                    total_compressed += compressed.len() as u64;
126
127                    Self::write_entry(&mut pack_data, &record, obj_type, &compressed)?;
128                }
129            } else {
130                Self::sort_for_delta_window(&mut objects);
131                Self::encode_with_sliding_window(
132                    &mut pack_data,
133                    &mut index,
134                    &mut total_uncompressed,
135                    &mut total_compressed,
136                    &mut delta_count,
137                    obj_type,
138                    objects,
139                    &self.compression,
140                )?;
141            }
142        }
143
144        index.sort();
145
146        append_container_checksum(&mut pack_data);
147
148        let stats = PackStats {
149            object_count,
150            total_uncompressed,
151            total_compressed,
152            delta_count,
153            compression_ratio: total_compressed as f64 / total_uncompressed as f64,
154        };
155
156        Ok((pack_data, index.to_bytes(), stats))
157    }
158
159    /// Sort objects for optimal delta window traversal.
160    ///
161    /// Sorts by: file extension → basename → size descending.
162    /// This ensures files with the same extension are adjacent (like Git sorting
163    /// `.rs` files together), within that group same-named files are adjacent,
164    /// and within that the largest comes first (best delta base candidate).
165    fn sort_for_delta_window(objects: &mut [PackObjectRecord]) {
166        objects.sort_by(|a, b| {
167            let key_a = Self::sort_key(&a.path_hint);
168            let key_b = Self::sort_key(&b.path_hint);
169            key_a.cmp(&key_b).then(b.data.len().cmp(&a.data.len()))
170        });
171    }
172
173    /// Extract a sort key from a path: (extension, basename_without_extension).
174    /// Objects without paths sort last.
175    fn sort_key(path: &Option<String>) -> (String, String) {
176        match path {
177            Some(p) => {
178                let filename = p.rsplit('/').next().unwrap_or(p);
179                if let Some(dot_pos) = filename.rfind('.') {
180                    let ext = filename[dot_pos + 1..].to_string();
181                    let stem = filename[..dot_pos].to_string();
182                    (ext, stem)
183                } else {
184                    (String::new(), filename.to_string())
185                }
186            }
187            None => ("\u{FFFF}".to_string(), String::new()),
188        }
189    }
190
191    /// Encode objects using a sliding window for delta base selection.
192    ///
193    /// For each object, tries delta encoding against the W most recent objects
194    /// in the window, picking the base that produces the smallest delta. This
195    /// is the same approach Git uses with `--window=10`.
196    #[allow(clippy::too_many_arguments)]
197    fn encode_with_sliding_window(
198        pack_data: &mut Vec<u8>,
199        index: &mut PackIndex,
200        total_uncompressed: &mut u64,
201        total_compressed: &mut u64,
202        delta_count: &mut u64,
203        obj_type: ObjectType,
204        objects: Vec<PackObjectRecord>,
205        compression: &CompressionConfig,
206    ) -> Result<()> {
207        let mut window: VecDeque<WindowEntry> = VecDeque::with_capacity(WINDOW_SIZE);
208
209        for record in objects {
210            let hash = match record.id {
211                PackObjectId::Hash(hash) => hash,
212                PackObjectId::ChangeId(_) => {
213                    let offset = pack_data.len() as u64;
214                    index.add(record.id, offset);
215                    *total_uncompressed += record.data.len() as u64;
216                    let compressed = compress_pack_payload(&record.data, compression)?;
217                    *total_compressed += compressed.len() as u64;
218                    Self::write_entry(pack_data, &record, obj_type, &compressed)?;
219                    continue;
220                }
221            };
222            let data = record.data;
223            let offset = pack_data.len() as u64;
224            index.add(PackObjectId::Hash(hash), offset);
225            *total_uncompressed += data.len() as u64;
226
227            // Try delta against each window entry, pick the best
228            let mut best_base_idx: Option<usize> = None;
229            let mut best_delta_estimate = usize::MAX;
230
231            if data.len() >= MIN_DELTA_SIZE {
232                for (i, entry) in window.iter().enumerate() {
233                    // Skip if this base is already at max chain depth
234                    if entry.chain_depth >= MAX_DELTA_CHAIN_DEPTH {
235                        continue;
236                    }
237                    // Skip if base is too small
238                    if entry.data.len() < MIN_DELTA_SIZE {
239                        continue;
240                    }
241
242                    let estimate = DeltaEncoder::estimate_delta_size_with_index(
243                        &entry.index,
244                        &entry.data,
245                        &data,
246                    );
247
248                    if estimate < best_delta_estimate {
249                        best_delta_estimate = estimate;
250                        best_base_idx = Some(i);
251                    }
252                }
253            }
254
255            // Decide: delta or raw?
256            let (final_data, entry_type, base_hash, chain_depth) =
257                if let Some(base_idx) = best_base_idx {
258                    let base_entry = &window[base_idx];
259                    let delta =
260                        DeltaEncoder::encode_with_index(&base_entry.index, &base_entry.data, &data);
261                    let delta_compressed = compress_pack_payload(&delta, compression)?;
262
263                    if delta_compressed.len() < data.len() {
264                        *delta_count += 1;
265                        let bh = base_entry.hash;
266                        let depth = base_entry.chain_depth + 1;
267                        (delta_compressed, ObjectType::Delta, Some(bh), depth)
268                    } else {
269                        let compressed = compress_pack_payload(&data, compression)?;
270                        (compressed, obj_type, None, 0)
271                    }
272                } else {
273                    let compressed = compress_pack_payload(&data, compression)?;
274                    (compressed, obj_type, None, 0)
275                };
276
277            *total_compressed += final_data.len() as u64;
278
279            Self::write_entry_parts(
280                pack_data,
281                PackObjectId::Hash(hash),
282                entry_type,
283                data.len(),
284                base_hash.map(PackObjectId::Hash),
285                &final_data,
286            )?;
287
288            // Add to window (build index once, reuse for all future comparisons)
289            let entry_index = DeltaEncoder::build_index(&data);
290            if window.len() >= WINDOW_SIZE {
291                window.pop_front();
292            }
293            window.push_back(WindowEntry {
294                hash,
295                data,
296                index: entry_index,
297                chain_depth,
298            });
299        }
300
301        Ok(())
302    }
303
304    fn group_by_type(objects: Vec<PackObjectRecord>) -> GroupedPackMap {
305        let mut groups: GroupedPackMap = HashMap::new();
306
307        for record in objects {
308            groups.entry(record.obj_type).or_default().push(record);
309        }
310
311        groups
312    }
313
314    /// Write a pack entry with varint-encoded sizes.
315    ///
316    /// Format per entry:
317    /// ```text
318    /// [tagged_id][type+uncompressed_size: varint][compressed_size: varint]
319    /// [tagged_base_id (delta only)][compressed_data]
320    /// ```
321    ///
322    /// The compressed data is raw zstd — no wrapper header — since the
323    /// entry already records both sizes.
324    fn write_entry(
325        pack: &mut Vec<u8>,
326        record: &PackObjectRecord,
327        obj_type: ObjectType,
328        compressed: &[u8],
329    ) -> Result<()> {
330        encode_tagged_entry(pack, record, obj_type, compressed)
331    }
332
333    fn write_entry_parts(
334        pack: &mut Vec<u8>,
335        id: PackObjectId,
336        obj_type: ObjectType,
337        uncompressed_size: usize,
338        delta_base: Option<PackObjectId>,
339        compressed: &[u8],
340    ) -> Result<()> {
341        encode_tagged_entry_parts(
342            pack,
343            id,
344            obj_type,
345            uncompressed_size,
346            delta_base,
347            compressed,
348        )
349    }
350}