Skip to main content

gix_pack/data/file/decode/
entry.rs

1use std::ops::Range;
2
3use gix_features::zlib;
4use smallvec::SmallVec;
5
6use crate::{
7    cache, data,
8    data::{File, delta, file::decode::Error},
9};
10
11/// A return value of a resolve function, which given an [`ObjectId`][gix_hash::ObjectId] determines where an object can be found.
12#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)]
13#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
14pub enum ResolvedBase {
15    /// Indicate an object is within this pack, at the given entry, and thus can be looked up locally.
16    InPack(data::Entry),
17    /// Indicates the object of `kind` was found outside of the pack, and its data was written into an output
18    /// vector which now has a length of `end`.
19    #[allow(missing_docs)]
20    OutOfPack { kind: gix_object::Kind, end: usize },
21}
22
23#[derive(Debug)]
24struct Delta {
25    data: Range<usize>,
26    base_size: usize,
27    result_size: usize,
28
29    decompressed_size: usize,
30    data_offset: data::Offset,
31}
32
33/// Additional information and statistics about a successfully decoded object produced by [`File::decode_entry()`].
34///
35/// Useful to understand the effectiveness of the pack compression or the cost of decompression.
36#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)]
37#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
38pub struct Outcome {
39    /// The kind of resolved object.
40    pub kind: gix_object::Kind,
41    /// The amount of deltas in the chain of objects that had to be resolved beforehand.
42    ///
43    /// This number is affected by the [`Cache`][cache::DecodeEntry] implementation, with cache hits shortening the
44    /// delta chain accordingly
45    pub num_deltas: u32,
46    /// The total decompressed size of all pack entries in the delta chain
47    pub decompressed_size: u64,
48    /// The total compressed size of all pack entries in the delta chain
49    pub compressed_size: usize,
50    /// The total size of the decoded object.
51    pub object_size: u64,
52}
53
54impl Outcome {
55    pub(crate) fn default_from_kind(kind: gix_object::Kind) -> Self {
56        Self {
57            kind,
58            num_deltas: 0,
59            decompressed_size: 0,
60            compressed_size: 0,
61            object_size: 0,
62        }
63    }
64    fn from_object_entry(kind: gix_object::Kind, entry: &data::Entry, compressed_size: usize) -> Self {
65        Self {
66            kind,
67            num_deltas: 0,
68            decompressed_size: entry.decompressed_size,
69            compressed_size,
70            object_size: entry.decompressed_size,
71        }
72    }
73}
74
75/// Decompression of objects
76impl<T> File<T>
77where
78    T: crate::FileData,
79{
80    fn decoded_object_size(&self, size: u64) -> Result<usize, Error> {
81        decoded_object_size(size, self.alloc_limit_bytes)
82    }
83
84    /// Decompress the given `entry` into `out` and return the amount of bytes read from the pack data.
85    /// Note that `inflate` is not reset after usage, but will be reset before using it.
86    ///
87    /// _Note_ that this method does not resolve deltified objects, but merely decompresses their content
88    /// `out` is expected to be large enough to hold `entry.size` bytes.
89    pub fn decompress_entry(
90        &self,
91        entry: &data::Entry,
92        inflate: &mut zlib::Inflate,
93        out: &mut [u8],
94    ) -> Result<usize, Error> {
95        let size: usize = entry.decompressed_size.try_into().map_err(|_| Error::OutOfMemory)?;
96        if out.len() < size {
97            return Err(Error::OutOfMemory);
98        }
99        self.decompress_entry_from_data_offset(entry.data_offset, inflate, &mut out[..size])
100    }
101
102    /// Obtain the [`Entry`][crate::data::Entry] at the given `offset` into the pack.
103    ///
104    /// The `offset` is typically obtained from the pack index file.
105    pub fn entry(&self, offset: data::Offset) -> Result<data::Entry, data::entry::decode::Error> {
106        let pack_offset: usize = offset.try_into().expect("offset representable by machine");
107        if pack_offset > self.data.len() {
108            return Err(data::entry::decode::Error::Corrupt {
109                message: "an entry offset pointing beyond pack data",
110            });
111        }
112
113        let object_data = &self.data[pack_offset..];
114        data::Entry::from_bytes(object_data, offset, self.hash_len)
115    }
116
117    /// Decompress the object expected at the given data offset, sans pack header. This information is only
118    /// known after the pack header was parsed.
119    /// Note that this method does not resolve deltified objects, but merely decompresses their content
120    /// `out` is expected to be large enough to hold `entry.size` bytes.
121    /// Returns the amount of packed bytes there read from the pack data file.
122    pub(crate) fn decompress_entry_from_data_offset(
123        &self,
124        data_offset: data::Offset,
125        inflate: &mut zlib::Inflate,
126        out: &mut [u8],
127    ) -> Result<usize, Error> {
128        let (consumed_in, _consumed_out) =
129            self.decompress_complete_entry_from_data_offset(data_offset, inflate, out)?;
130        Ok(consumed_in)
131    }
132
133    /// Like `decompress_entry_from_data_offset`, but returns `(consumed-input, consumed-output)`.
134    ///
135    /// The compressed stream must end exactly after producing `out.len()` bytes. Pack entry
136    /// headers are untrusted, so callers must not accept streams that stop early and
137    /// leave zero-filled slack in the destination buffer, nor streams that require more output
138    /// than the header promised. Both cases would make later delta parsing operate on bytes that
139    /// are not the entry payload described by the pack header.
140    pub(crate) fn decompress_complete_entry_from_data_offset(
141        &self,
142        data_offset: data::Offset,
143        inflate: &mut zlib::Inflate,
144        out: &mut [u8],
145    ) -> Result<(usize, usize), Error> {
146        let (status, consumed_in, consumed_out) =
147            self.decompress_entry_from_data_offset_unchecked(data_offset, inflate, out)?;
148        if status != zlib::Status::StreamEnd || consumed_out != out.len() {
149            return Err(data::entry::decode::Error::Corrupt {
150                message: "pack entry decompressed size does not match entry header",
151            }
152            .into());
153        }
154        Ok((consumed_in, consumed_out))
155    }
156
157    /// Like [`Self::decompress_commplete_entry_from_data_offset()`], but allows callers to inspect incomplete streams.
158    ///
159    /// This is only for callers that intentionally decompress a prefix into a smaller buffer, such as
160    /// delta header probing. Full pack entry decoding should use [`Self::decompress_commplete_entry_from_data_offset()`].
161    pub(crate) fn decompress_entry_from_data_offset_unchecked(
162        &self,
163        data_offset: data::Offset,
164        inflate: &mut zlib::Inflate,
165        out: &mut [u8],
166    ) -> Result<(zlib::Status, usize, usize), Error> {
167        let offset: usize = data_offset.try_into().expect("offset representable by machine");
168        if offset >= self.data.len() {
169            return Err(data::entry::decode::Error::Corrupt {
170                message: "an entry data offset pointing beyond pack data",
171            }
172            .into());
173        }
174
175        inflate.reset();
176        inflate.once(&self.data[offset..], out).map_err(Into::into)
177    }
178
179    /// Decode an entry, resolving delta's as needed, while growing the `out` vector if there is not enough
180    /// space to hold the result object.
181    ///
182    /// The `entry` determines which object to decode, and is commonly obtained with the help of a pack index file or through pack iteration.
183    /// `inflate` will be used for decompressing entries, and will not be reset after usage, but before first using it.
184    ///
185    /// `resolve` is a function to lookup objects with the given [`ObjectId`][gix_hash::ObjectId], in case the full object id is used to refer to
186    /// a base object, instead of an in-pack offset.
187    ///
188    /// `delta_cache` is a mechanism to avoid looking up base objects multiple times when decompressing multiple objects in a row.
189    /// Use a [Noop-Cache][cache::Never] to disable caching all together at the cost of repeating work.
190    pub fn decode_entry(
191        &self,
192        entry: data::Entry,
193        out: &mut Vec<u8>,
194        inflate: &mut zlib::Inflate,
195        resolve: &dyn Fn(&gix_hash::oid, &mut Vec<u8>) -> Option<ResolvedBase>,
196        delta_cache: &mut dyn cache::DecodeEntry,
197    ) -> Result<Outcome, Error> {
198        use crate::data::entry::Header::*;
199        match entry.header {
200            Tree | Blob | Commit | Tag => {
201                let size = self.decoded_object_size(entry.decompressed_size)?;
202                if let Some(additional) = size.checked_sub(out.len()) {
203                    out.try_reserve(additional)?;
204                }
205                out.resize(size, 0);
206                self.decompress_entry(&entry, inflate, out.as_mut_slice())
207                    .map(|consumed_input| {
208                        Outcome::from_object_entry(
209                            entry.header.as_kind().expect("a non-delta entry"),
210                            &entry,
211                            consumed_input,
212                        )
213                    })
214            }
215            OfsDelta { .. } | RefDelta { .. } => self.resolve_deltas(entry, resolve, inflate, out, delta_cache),
216        }
217    }
218
219    /// resolve: technically, this shouldn't ever be required as stored local packs don't refer to objects by id
220    /// that are outside of the pack. Unless, of course, the ref refers to an object within this pack, which means
221    /// it's very, very large as 20bytes are smaller than the corresponding MSB encoded number
222    fn resolve_deltas(
223        &self,
224        last: data::Entry,
225        resolve: &dyn Fn(&gix_hash::oid, &mut Vec<u8>) -> Option<ResolvedBase>,
226        inflate: &mut zlib::Inflate,
227        out: &mut Vec<u8>,
228        cache: &mut dyn cache::DecodeEntry,
229    ) -> Result<Outcome, Error> {
230        // all deltas, from the one that produces the desired object (first) to the oldest at the end of the chain
231        let mut chain = SmallVec::<[Delta; 10]>::default();
232        let first_entry = last.clone();
233        let mut cursor = last;
234        let mut base_buffer_size: Option<usize> = None;
235        let mut object_kind: Option<gix_object::Kind> = None;
236        let mut consumed_input: Option<usize> = None;
237
238        // Find the first full base, either an undeltified object in the pack or a reference to another object.
239        let mut total_delta_data_size: u64 = 0;
240        while cursor.header.is_delta() {
241            if let Some((kind, packed_size)) = cache.get(self.id, cursor.data_offset, out) {
242                base_buffer_size = Some(out.len());
243                object_kind = Some(kind);
244                // If the input entry is a cache hit, keep the packed size as it must be returned.
245                // Otherwise, the packed size will be determined later when decompressing the input delta
246                if total_delta_data_size == 0 {
247                    consumed_input = Some(packed_size);
248                }
249                break;
250            }
251            // This is a pessimistic guess, as worst possible compression should not be bigger than the data itself.
252            // TODO: is this assumption actually true?
253            total_delta_data_size = total_delta_data_size
254                .checked_add(cursor.decompressed_size)
255                .ok_or(Error::OutOfMemory)?;
256            let decompressed_size = self.decoded_object_size(cursor.decompressed_size)?;
257            chain.push(Delta {
258                data: Range {
259                    start: 0,
260                    end: decompressed_size,
261                },
262                base_size: 0,
263                result_size: 0,
264                decompressed_size,
265                data_offset: cursor.data_offset,
266            });
267            use crate::data::entry::Header;
268            cursor = match cursor.header {
269                Header::OfsDelta { base_distance } => {
270                    self.entry(cursor.checked_base_pack_offset(base_distance).ok_or(
271                        crate::data::entry::decode::Error::Corrupt {
272                            message: "an ofs-delta base distance pointing before pack start",
273                        },
274                    )?)?
275                }
276                Header::RefDelta { base_id } => match resolve(base_id.as_ref(), out) {
277                    Some(ResolvedBase::InPack(entry)) => entry,
278                    Some(ResolvedBase::OutOfPack { end, kind }) => {
279                        base_buffer_size = Some(end);
280                        object_kind = Some(kind);
281                        break;
282                    }
283                    None => return Err(Error::DeltaBaseUnresolved(base_id)),
284                },
285                _ => unreachable!("cursor.is_delta() only allows deltas here"),
286            };
287        }
288
289        // This can happen if the cache held the first entry itself
290        // We will just treat it as an object then, even though it's technically incorrect.
291        if chain.is_empty() {
292            return Ok(Outcome::from_object_entry(
293                object_kind.expect("object kind as set by cache"),
294                &first_entry,
295                consumed_input.expect("consumed bytes as set by cache"),
296            ));
297        }
298
299        // First pass will decompress all delta data and keep it in our output buffer
300        // [<possibly resolved base object>]<delta-1..delta-n>...
301        // so that we can find the biggest result size.
302        let total_delta_data_size: usize = total_delta_data_size.try_into().map_err(|_| Error::OutOfMemory)?;
303
304        let chain_len = chain.len();
305        let (first_buffer_end, second_buffer_end) = {
306            let delta_start = base_buffer_size.unwrap_or(0);
307
308            let delta_range = Range {
309                start: delta_start,
310                end: delta_start
311                    .checked_add(total_delta_data_size)
312                    .ok_or(Error::OutOfMemory)?,
313            };
314            out.try_reserve(delta_range.end.saturating_sub(out.len()))?;
315            out.resize(delta_range.end, 0);
316
317            let mut instructions = &mut out[delta_range.clone()];
318            let mut relative_delta_start = 0;
319            let mut biggest_result_size = 0;
320            for (delta_idx, delta) in chain.iter_mut().rev().enumerate() {
321                let (consumed_from_data_offset, consumed_out) = self.decompress_complete_entry_from_data_offset(
322                    delta.data_offset,
323                    inflate,
324                    &mut instructions[..delta.decompressed_size],
325                )?;
326                let is_last_delta_to_be_applied = delta_idx + 1 == chain_len;
327                if is_last_delta_to_be_applied {
328                    consumed_input = Some(consumed_from_data_offset);
329                }
330
331                let current_delta = &instructions[..consumed_out];
332                let (base_size, offset) = delta::decode_header_size(current_delta)?;
333                let mut bytes_consumed_by_header = offset;
334                biggest_result_size = biggest_result_size.max(base_size);
335                delta.base_size = self.decoded_object_size(base_size)?;
336
337                let (result_size, offset) = delta::decode_header_size(&current_delta[offset..])?;
338                bytes_consumed_by_header += offset;
339                biggest_result_size = biggest_result_size.max(result_size);
340                delta.result_size = self.decoded_object_size(result_size)?;
341
342                // the absolute location into the instructions buffer, so we keep track of the end point of the last
343                delta.data.start = relative_delta_start + bytes_consumed_by_header;
344                delta.data.end = relative_delta_start + consumed_out;
345                relative_delta_start += delta.decompressed_size;
346
347                instructions = &mut instructions[delta.decompressed_size..];
348            }
349
350            // Now we can produce a buffer like this
351            // [<biggest-result-buffer, possibly filled with resolved base object data>]<biggest-result-buffer><delta-1..delta-n>
352            // from [<possibly resolved base object>]<delta-1..delta-n>...
353            if base_buffer_size.is_none() {
354                biggest_result_size = biggest_result_size.max(cursor.decompressed_size);
355            }
356            let biggest_result_size = self.decoded_object_size(biggest_result_size)?;
357            let first_buffer_size = biggest_result_size;
358            let second_buffer_size = first_buffer_size;
359            let out_size = first_buffer_size
360                .checked_add(second_buffer_size)
361                .and_then(|size| size.checked_add(total_delta_data_size))
362                .ok_or(Error::OutOfMemory)?;
363            out.try_reserve(out_size.saturating_sub(out.len()))?;
364            out.resize(out_size, 0);
365
366            // Now 'rescue' the deltas, because in the next step we possibly overwrite that portion
367            // of memory with the base object (in the majority of cases)
368            let second_buffer_end = {
369                let end = first_buffer_size
370                    .checked_add(second_buffer_size)
371                    .ok_or(Error::OutOfMemory)?;
372                // Move the decompressed delta instructions behind the two work buffers so they remain intact
373                // while we repurpose the front of `out` for base-object materialization and delta application.
374                out.copy_within(delta_range, end);
375                end
376            };
377
378            // If we don't have a out-of-pack object already, fill the base-buffer by decompressing the full object
379            // at which the cursor is left after the iteration
380            if base_buffer_size.is_none() {
381                let base_entry = cursor;
382                debug_assert!(!base_entry.header.is_delta());
383                object_kind = base_entry.header.as_kind();
384                let base_size = self.decoded_object_size(base_entry.decompressed_size)?;
385                let out_base = &mut out[..base_size];
386                self.decompress_entry_from_data_offset(base_entry.data_offset, inflate, out_base)?;
387            }
388
389            (first_buffer_size, second_buffer_end)
390        };
391
392        // From oldest to most recent, apply all deltas, swapping the buffer back and forth
393        // TODO: once we have more tests, we could optimize this memory-intensive work to
394        //       analyse the delta-chains to only copy data once - after all, with 'copy-from-base' deltas,
395        //       all data originates from one base at some point.
396        // `out` is: [source-buffer][target-buffer][max-delta-instructions-buffer]
397        let (buffers, instructions) = out.split_at_mut(second_buffer_end);
398        let (mut source_buf, mut target_buf) = buffers.split_at_mut(first_buffer_end);
399
400        let mut last_result_size = None;
401        for (
402            delta_idx,
403            Delta {
404                data,
405                base_size,
406                result_size,
407                ..
408            },
409        ) in chain.into_iter().rev().enumerate()
410        {
411            let data = &mut instructions[data];
412            if delta_idx + 1 == chain_len {
413                last_result_size = Some(result_size);
414            }
415            delta::apply(&source_buf[..base_size], &mut target_buf[..result_size], data)?;
416            // use the target as source for the next delta
417            std::mem::swap(&mut source_buf, &mut target_buf);
418        }
419
420        let last_result_size = last_result_size.expect("at least one delta chain item");
421        // uneven chains leave the target buffer after the source buffer
422        // FIXME(Performance) If delta-chains are uneven, we know we will have to copy bytes over here
423        //      Instead we could use a different start buffer, to naturally end up with the result in the
424        //      right one.
425        //      However, this is a bit more complicated than just that - you have to deal with the base
426        //      object, which should also be placed in the second buffer right away. You don't have that
427        //      control/knowledge for out-of-pack bases, so this is a special case to deal with, too.
428        //      Maybe these invariants can be represented in the type system though.
429        if chain_len % 2 == 1 {
430            // this seems inverted, but remember: we swapped the buffers on the last iteration
431            target_buf[..last_result_size].copy_from_slice(&source_buf[..last_result_size]);
432        }
433        debug_assert!(out.len() >= last_result_size);
434        out.truncate(last_result_size);
435
436        let object_kind = object_kind.expect("a base object as root of any delta chain that we are here to resolve");
437        let consumed_input = consumed_input.expect("at least one decompressed delta object");
438        cache.put(
439            self.id,
440            first_entry.data_offset,
441            out.as_slice(),
442            object_kind,
443            consumed_input,
444        );
445        Ok(Outcome {
446            kind: object_kind,
447            // technically depending on the cache, the chain size is not correct as it might
448            // have been cut short by a cache hit. The caller must deactivate the cache to get
449            // actual results
450            num_deltas: chain_len as u32,
451            decompressed_size: first_entry.decompressed_size,
452            compressed_size: consumed_input,
453            object_size: last_result_size as u64,
454        })
455    }
456}
457
458/// Convert user-controlled sizes from pack data into allocation sizes while enforcing the configured allocation cap.
459fn decoded_object_size(size: u64, alloc_limit_bytes: Option<usize>) -> Result<usize, Error> {
460    let size: usize = size.try_into().map_err(|_| Error::OutOfMemory)?;
461    if alloc_limit_bytes.is_some_and(|limit| size > limit) {
462        return Err(Error::OutOfMemory);
463    }
464    Ok(size)
465}
466
467#[cfg(test)]
468mod tests {
469    use gix_testtools::size_ok;
470
471    use super::*;
472
473    #[test]
474    fn size_of_decode_entry_outcome() {
475        let actual = std::mem::size_of::<Outcome>();
476        let expected = 32;
477        assert!(
478            size_ok(actual, expected),
479            "this shouldn't change without use noticing as it's returned a lot: {actual} <~ {expected}"
480        );
481    }
482}