Skip to main content

objects/store/pack/
pack_builder.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Pack builder for creating packfiles.
3
4use std::collections::{HashMap, VecDeque};
5
6use super::{
7    ObjectType, PackObjectId, PackObjectRecord, PackStats, append_container_checksum,
8    compress_pack_payload, encode_tagged_entry, encode_tagged_entry_parts, pack_container_spec,
9    pack_index::PackIndex, write_container_header,
10};
11use crate::{
12    delta::DeltaEncoder,
13    object::ContentHash,
14    store::{Result, compression::CompressionConfig},
15};
16
17const MIN_DELTA_SIZE: usize = 64;
18/// Maximum depth for delta chains (matches Git's default).
19const MAX_DELTA_CHAIN_DEPTH: usize = 50;
20/// Number of recent objects to try as delta bases (Git default: 10).
21const WINDOW_SIZE: usize = 10;
22
23type GroupedPackMap = HashMap<ObjectType, Vec<PackObjectRecord>>;
24
25/// Pack builder for creating packfiles.
26pub struct PackBuilder {
27    objects: Vec<PackObjectRecord>,
28    compression: CompressionConfig,
29}
30
31/// A recent object in the sliding window, with cached hash index for fast delta estimation.
32struct WindowEntry {
33    hash: ContentHash,
34    data: Vec<u8>,
35    index: HashMap<[u8; 4], Vec<usize>>,
36    chain_depth: usize,
37}
38
39impl PackBuilder {
40    /// Create a new pack builder.
41    pub fn new(compression: CompressionConfig) -> Self {
42        Self {
43            objects: Vec::new(),
44            compression,
45        }
46    }
47
48    /// Add an object to the pack.
49    pub fn add(&mut self, hash: ContentHash, obj_type: ObjectType, data: Vec<u8>) {
50        self.add_id(PackObjectId::Hash(hash), obj_type, data);
51    }
52
53    pub fn add_id(&mut self, id: PackObjectId, obj_type: ObjectType, data: Vec<u8>) {
54        self.objects.push(PackObjectRecord {
55            id,
56            obj_type,
57            data,
58            delta_base: None,
59            path_hint: None,
60        });
61    }
62
63    /// Add an object with a path hint for better delta grouping.
64    ///
65    /// Objects sharing the same path (e.g. successive versions of `src/main.rs`)
66    /// will be sorted together for delta encoding, producing much better
67    /// compression ratios than size-only ordering.
68    pub fn add_with_path(
69        &mut self,
70        hash: ContentHash,
71        obj_type: ObjectType,
72        data: Vec<u8>,
73        path: Option<String>,
74    ) {
75        self.add_with_path_id(PackObjectId::Hash(hash), obj_type, data, path);
76    }
77
78    pub fn add_with_path_id(
79        &mut self,
80        id: PackObjectId,
81        obj_type: ObjectType,
82        data: Vec<u8>,
83        path: Option<String>,
84    ) {
85        self.objects.push(PackObjectRecord {
86            id,
87            obj_type,
88            data,
89            delta_base: None,
90            path_hint: path,
91        });
92    }
93
94    /// Build the packfile and index.
95    ///
96    /// Returns the pack data, index data, and statistics.
97    pub fn build(self) -> Result<(Vec<u8>, Vec<u8>, PackStats)> {
98        let mut pack_data = Vec::new();
99        let mut index = PackIndex::new();
100
101        write_container_header(
102            &mut pack_data,
103            pack_container_spec(),
104            self.objects.len() as u64,
105        );
106
107        let mut total_uncompressed = 0u64;
108        let mut total_compressed = 0u64;
109        let mut delta_count = 0u64;
110
111        let object_count = self.objects.len() as u64;
112        let grouped = Self::group_by_type(self.objects);
113
114        for (obj_type, mut objects) in grouped {
115            if objects.len() < 2
116                || obj_type == ObjectType::State
117                || self.compression.max_delta_size == 0
118            {
119                // Single objects, states, or transfer-tuned packs with delta disabled:
120                // write entries directly without running the sliding delta search.
121                for record in objects {
122                    let offset = pack_data.len() as u64;
123                    index.add(record.id, offset);
124
125                    total_uncompressed += record.data.len() as u64;
126                    let compressed = compress_pack_payload(&record.data, &self.compression)?;
127                    total_compressed += compressed.len() as u64;
128
129                    Self::write_entry(&mut pack_data, &record, obj_type, &compressed)?;
130                }
131            } else {
132                Self::sort_for_delta_window(&mut objects);
133                Self::encode_with_sliding_window(
134                    &mut pack_data,
135                    &mut index,
136                    &mut total_uncompressed,
137                    &mut total_compressed,
138                    &mut delta_count,
139                    obj_type,
140                    objects,
141                    &self.compression,
142                )?;
143            }
144        }
145
146        index.sort();
147
148        append_container_checksum(&mut pack_data);
149
150        let stats = PackStats {
151            object_count,
152            total_uncompressed,
153            total_compressed,
154            delta_count,
155            compression_ratio: total_compressed as f64 / total_uncompressed as f64,
156        };
157
158        Ok((pack_data, index.to_bytes(), stats))
159    }
160
161    /// Sort objects for optimal delta window traversal.
162    ///
163    /// Sorts by: file extension → basename → size descending.
164    /// This ensures files with the same extension are adjacent (like Git sorting
165    /// `.rs` files together), within that group same-named files are adjacent,
166    /// and within that the largest comes first (best delta base candidate).
167    fn sort_for_delta_window(objects: &mut [PackObjectRecord]) {
168        objects.sort_by(|a, b| {
169            let key_a = Self::sort_key(&a.path_hint);
170            let key_b = Self::sort_key(&b.path_hint);
171            key_a.cmp(&key_b).then(b.data.len().cmp(&a.data.len()))
172        });
173    }
174
175    /// Extract a sort key from a path: (extension, basename_without_extension).
176    /// Objects without paths sort last.
177    fn sort_key(path: &Option<String>) -> (String, String) {
178        match path {
179            Some(p) => {
180                let filename = p.rsplit('/').next().unwrap_or(p);
181                if let Some(dot_pos) = filename.rfind('.') {
182                    let ext = filename[dot_pos + 1..].to_string();
183                    let stem = filename[..dot_pos].to_string();
184                    (ext, stem)
185                } else {
186                    (String::new(), filename.to_string())
187                }
188            }
189            None => ("\u{FFFF}".to_string(), String::new()),
190        }
191    }
192
193    /// Encode objects using a sliding window for delta base selection.
194    ///
195    /// For each object, tries delta encoding against the W most recent objects
196    /// in the window, picking the base that produces the smallest delta. This
197    /// is the same approach Git uses with `--window=10`.
198    #[allow(clippy::too_many_arguments)]
199    fn encode_with_sliding_window(
200        pack_data: &mut Vec<u8>,
201        index: &mut PackIndex,
202        total_uncompressed: &mut u64,
203        total_compressed: &mut u64,
204        delta_count: &mut u64,
205        obj_type: ObjectType,
206        objects: Vec<PackObjectRecord>,
207        compression: &CompressionConfig,
208    ) -> Result<()> {
209        let mut window: VecDeque<WindowEntry> = VecDeque::with_capacity(WINDOW_SIZE);
210
211        for record in objects {
212            let hash = match record.id {
213                PackObjectId::Hash(hash) => hash,
214                PackObjectId::ChangeId(_) => {
215                    let offset = pack_data.len() as u64;
216                    index.add(record.id, offset);
217                    *total_uncompressed += record.data.len() as u64;
218                    let compressed = compress_pack_payload(&record.data, compression)?;
219                    *total_compressed += compressed.len() as u64;
220                    Self::write_entry(pack_data, &record, obj_type, &compressed)?;
221                    continue;
222                }
223            };
224            let data = record.data;
225            let offset = pack_data.len() as u64;
226            index.add(PackObjectId::Hash(hash), offset);
227            *total_uncompressed += data.len() as u64;
228
229            // Try delta against each window entry, pick the best
230            let mut best_base_idx: Option<usize> = None;
231            let mut best_delta_estimate = usize::MAX;
232
233            if data.len() >= MIN_DELTA_SIZE {
234                for (i, entry) in window.iter().enumerate() {
235                    // Skip if this base is already at max chain depth
236                    if entry.chain_depth >= MAX_DELTA_CHAIN_DEPTH {
237                        continue;
238                    }
239                    // Skip if base is too small
240                    if entry.data.len() < MIN_DELTA_SIZE {
241                        continue;
242                    }
243
244                    let estimate = DeltaEncoder::estimate_delta_size_with_index(
245                        &entry.index,
246                        &entry.data,
247                        &data,
248                    );
249
250                    if estimate < best_delta_estimate {
251                        best_delta_estimate = estimate;
252                        best_base_idx = Some(i);
253                    }
254                }
255            }
256
257            // Decide: delta or raw?
258            let (final_data, entry_type, base_hash, chain_depth) =
259                if let Some(base_idx) = best_base_idx {
260                    let base_entry = &window[base_idx];
261                    let delta =
262                        DeltaEncoder::encode_with_index(&base_entry.index, &base_entry.data, &data);
263                    let delta_compressed = compress_pack_payload(&delta, compression)?;
264
265                    if delta_compressed.len() < data.len() {
266                        *delta_count += 1;
267                        let bh = base_entry.hash;
268                        let depth = base_entry.chain_depth + 1;
269                        (delta_compressed, ObjectType::Delta, Some(bh), depth)
270                    } else {
271                        let compressed = compress_pack_payload(&data, compression)?;
272                        (compressed, obj_type, None, 0)
273                    }
274                } else {
275                    let compressed = compress_pack_payload(&data, compression)?;
276                    (compressed, obj_type, None, 0)
277                };
278
279            *total_compressed += final_data.len() as u64;
280
281            Self::write_entry_parts(
282                pack_data,
283                PackObjectId::Hash(hash),
284                entry_type,
285                data.len(),
286                base_hash.map(PackObjectId::Hash),
287                &final_data,
288            )?;
289
290            // Add to window (build index once, reuse for all future comparisons)
291            let entry_index = DeltaEncoder::build_index(&data);
292            if window.len() >= WINDOW_SIZE {
293                window.pop_front();
294            }
295            window.push_back(WindowEntry {
296                hash,
297                data,
298                index: entry_index,
299                chain_depth,
300            });
301        }
302
303        Ok(())
304    }
305
306    fn group_by_type(objects: Vec<PackObjectRecord>) -> GroupedPackMap {
307        let mut groups: GroupedPackMap = HashMap::new();
308
309        for record in objects {
310            groups.entry(record.obj_type).or_default().push(record);
311        }
312
313        groups
314    }
315
316    /// Write a pack entry with varint-encoded sizes.
317    ///
318    /// Format per entry:
319    /// ```text
320    /// [tagged_id][type+uncompressed_size: varint][compressed_size: varint]
321    /// [tagged_base_id (delta only)][compressed_data]
322    /// ```
323    ///
324    /// The compressed data is raw zstd — no wrapper header — since the
325    /// entry already records both sizes.
326    fn write_entry(
327        pack: &mut Vec<u8>,
328        record: &PackObjectRecord,
329        obj_type: ObjectType,
330        compressed: &[u8],
331    ) -> Result<()> {
332        encode_tagged_entry(pack, record, obj_type, compressed)
333    }
334
335    fn write_entry_parts(
336        pack: &mut Vec<u8>,
337        id: PackObjectId,
338        obj_type: ObjectType,
339        uncompressed_size: usize,
340        delta_base: Option<PackObjectId>,
341        compressed: &[u8],
342    ) -> Result<()> {
343        encode_tagged_entry_parts(
344            pack,
345            id,
346            obj_type,
347            uncompressed_size,
348            delta_base,
349            compressed,
350        )
351    }
352}