git_pack/data/file/decode/entry.rs

use std::{convert::TryInto, ops::Range};

use git_features::zlib;
use smallvec::SmallVec;

use crate::{
    cache, data,
    data::{delta, file::decode::Error, File},
};

/// A return value of a resolve function, which given an [`ObjectId`][git_hash::ObjectId] determines where an object can be found.
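///
/// # Examples
///
/// A sketch of a resolve function reporting a base found outside the pack; `kind` and `data`
/// are assumed inputs, and the import path mirrors this module's location (it may differ if
/// the type is re-exported elsewhere):
///
/// ```no_run
/// use git_pack::data::file::decode::entry::ResolvedBase;
///
/// fn found_externally(out: &mut Vec<u8>, kind: git_object::Kind, data: &[u8]) -> ResolvedBase {
///     out.extend_from_slice(data);
///     ResolvedBase::OutOfPack { kind, end: out.len() }
/// }
/// ```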
#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
pub enum ResolvedBase {
    /// Indicates an object is within this pack, at the given entry, and thus can be looked up locally.
    InPack(data::Entry),
    /// Indicates the object of `kind` was found outside of the pack, and its data was written into an output
    /// vector which now has a length of `end`.
    #[allow(missing_docs)]
    OutOfPack { kind: git_object::Kind, end: usize },
}

#[derive(Debug)]
struct Delta {
    /// The range of this delta's instruction data within the shared instructions buffer.
    data: Range<usize>,
    /// The expected size of the base object, as encoded in the delta header.
    base_size: usize,
    /// The size of the object this delta produces, as encoded in the delta header.
    result_size: usize,

    /// The decompressed size of this delta entry within the pack.
    decompressed_size: usize,
    /// The offset at which this delta's compressed data begins in the pack data file.
    data_offset: data::Offset,
}

/// Additional information and statistics about a successfully decoded object produced by [`File::decode_entry()`].
///
/// Useful to understand the effectiveness of the pack compression or the cost of decompression.
#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
pub struct Outcome {
    /// The kind of resolved object.
    pub kind: git_object::Kind,
    /// The number of deltas in the chain of objects that had to be resolved beforehand.
    ///
    /// This number is affected by the [`Cache`][cache::DecodeEntry] implementation, with cache hits shortening the
    /// delta chain accordingly.
    pub num_deltas: u32,
    /// The total decompressed size of all pack entries in the delta chain.
    pub decompressed_size: u64,
    /// The total compressed size of all pack entries in the delta chain.
    pub compressed_size: usize,
    /// The total size of the decoded object.
    pub object_size: u64,
}

impl Outcome {
    pub(crate) fn default_from_kind(kind: git_object::Kind) -> Self {
        Self {
            kind,
            num_deltas: 0,
            decompressed_size: 0,
            compressed_size: 0,
            object_size: 0,
        }
    }

    fn from_object_entry(kind: git_object::Kind, entry: &data::Entry, compressed_size: usize) -> Self {
        Self {
            kind,
            num_deltas: 0,
            decompressed_size: entry.decompressed_size,
            compressed_size,
            object_size: entry.decompressed_size,
        }
    }
}

/// Decompression of objects
impl File {
    /// Decompress the given `entry` into `out` and return the number of bytes read from the pack data.
    ///
    /// _Note_ that this method does not resolve deltified objects, but merely decompresses their content.
    /// `out` is expected to be large enough to hold `entry.decompressed_size` bytes.
    ///
    /// # Panics
    ///
    /// If `out` isn't large enough to hold the decompressed `entry`.
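    ///
    /// # Examples
    ///
    /// A minimal sketch, assuming `pack` is an open pack data [`File`] and `entry` was obtained
    /// elsewhere, e.g. from an index lookup via [`File::entry()`]:
    ///
    /// ```no_run
    /// # fn example(pack: &git_pack::data::File, entry: &git_pack::data::Entry) -> Result<(), Box<dyn std::error::Error>> {
    /// let mut out = vec![0u8; entry.decompressed_size as usize];
    /// let _bytes_read_from_pack = pack.decompress_entry(entry, &mut out)?;
    /// # Ok(())
    /// # }
    /// ```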
    pub fn decompress_entry(&self, entry: &data::Entry, out: &mut [u8]) -> Result<usize, Error> {
        assert!(
            out.len() as u64 >= entry.decompressed_size,
            "output buffer isn't large enough to hold decompressed result, want {}, have {}",
            entry.decompressed_size,
            out.len()
        );

        self.decompress_entry_from_data_offset(entry.data_offset, out)
            .map_err(Into::into)
    }

    fn assure_v2(&self) {
        assert!(
            matches!(self.version, crate::data::Version::V2),
            "Only V2 is implemented"
        );
    }

    /// Obtain the [`Entry`][crate::data::Entry] at the given `offset` into the pack.
    ///
    /// The `offset` is typically obtained from the pack index file.
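    ///
    /// # Examples
    ///
    /// A minimal sketch; the offset `12` is hypothetical (it is where a first entry would start,
    /// right after the 12-byte pack header) and would normally come from an index lookup:
    ///
    /// ```no_run
    /// # fn example(pack: &git_pack::data::File) {
    /// let entry = pack.entry(12);
    /// assert!(entry.decompressed_size > 0);
    /// # }
    /// ```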
    pub fn entry(&self, offset: data::Offset) -> data::Entry {
        self.assure_v2();
        let pack_offset: usize = offset.try_into().expect("offset representable by machine");
        assert!(pack_offset <= self.data.len(), "offset out of bounds");

        let object_data = &self.data[pack_offset..];
        data::Entry::from_bytes(object_data, offset, self.hash_len)
    }

    /// Decompress the object expected at the given data offset, sans the entry header. This offset
    /// is only known after the entry header was parsed.
    /// Note that this method does not resolve deltified objects, but merely decompresses their content.
    /// `out` is expected to be large enough to hold `entry.decompressed_size` bytes.
    /// Returns the number of packed bytes read from the pack data file.
    pub(crate) fn decompress_entry_from_data_offset(
        &self,
        data_offset: data::Offset,
        out: &mut [u8],
    ) -> Result<usize, zlib::inflate::Error> {
        let offset: usize = data_offset.try_into().expect("offset representable by machine");
        assert!(offset < self.data.len(), "entry offset out of bounds");

        zlib::Inflate::default()
            .once(&self.data[offset..], out)
            .map(|(_status, consumed_in, _consumed_out)| consumed_in)
    }

    /// Like `decompress_entry_from_data_offset`, but returns the consumed input and output byte counts.
    pub(crate) fn decompress_entry_from_data_offset_2(
        &self,
        data_offset: data::Offset,
        out: &mut [u8],
    ) -> Result<(usize, usize), zlib::inflate::Error> {
        let offset: usize = data_offset.try_into().expect("offset representable by machine");
        assert!(offset < self.data.len(), "entry offset out of bounds");

        zlib::Inflate::default()
            .once(&self.data[offset..], out)
            .map(|(_status, consumed_in, consumed_out)| (consumed_in, consumed_out))
    }

    /// Decode an entry, resolving deltas as needed, while growing the `out` vector if there is not enough
    /// space to hold the result object.
    ///
    /// The `entry` determines which object to decode, and is commonly obtained with the help of a pack index file or through pack iteration.
    ///
    /// `resolve` is a function to lookup objects with the given [`ObjectId`][git_hash::ObjectId], in case the full object id is used to refer to
    /// a base object, instead of an in-pack offset.
    ///
    /// `delta_cache` is a mechanism to avoid looking up base objects multiple times when decompressing multiple objects in a row.
    /// Use a [Noop-Cache][cache::Never] to disable caching altogether at the cost of repeating work.
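    ///
    /// # Examples
    ///
    /// A minimal sketch that decodes a single `entry` without caching; the resolver below assumes
    /// all delta bases live inside this pack and thus never resolves anything, while a real one
    /// would look up out-of-pack base objects by id:
    ///
    /// ```no_run
    /// # fn example(pack: &git_pack::data::File, entry: git_pack::data::Entry) -> Result<(), Box<dyn std::error::Error>> {
    /// let mut out = Vec::new();
    /// let outcome = pack.decode_entry(
    ///     entry,
    ///     &mut out,
    ///     |_id, _out| None, // no out-of-pack lookup in this sketch
    ///     &mut git_pack::cache::Never,
    /// )?;
    /// assert_eq!(outcome.object_size, out.len() as u64);
    /// # Ok(())
    /// # }
    /// ```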
    pub fn decode_entry(
        &self,
        entry: data::Entry,
        out: &mut Vec<u8>,
        resolve: impl Fn(&git_hash::oid, &mut Vec<u8>) -> Option<ResolvedBase>,
        delta_cache: &mut impl cache::DecodeEntry,
    ) -> Result<Outcome, Error> {
        use crate::data::entry::Header::*;
        match entry.header {
            Tree | Blob | Commit | Tag => {
                out.resize(
                    entry
                        .decompressed_size
                        .try_into()
                        .expect("size representable by machine"),
                    0,
                );
                self.decompress_entry(&entry, out.as_mut_slice()).map(|consumed_input| {
                    Outcome::from_object_entry(
                        entry.header.as_kind().expect("a non-delta entry"),
                        &entry,
                        consumed_input,
                    )
                })
            }
            OfsDelta { .. } | RefDelta { .. } => self.resolve_deltas(entry, resolve, out, delta_cache),
        }
    }

    /// resolve: technically, this shouldn't ever be required as stored local packs don't refer to objects by id
    /// that are outside of the pack. Unless, of course, the ref refers to an object within this pack, which means
    /// it's very, very large, as 20 bytes are smaller than the corresponding MSB-encoded number.
    fn resolve_deltas(
        &self,
        last: data::Entry,
        resolve: impl Fn(&git_hash::oid, &mut Vec<u8>) -> Option<ResolvedBase>,
        out: &mut Vec<u8>,
        cache: &mut impl cache::DecodeEntry,
    ) -> Result<Outcome, Error> {
        // all deltas, from the one that produces the desired object (first) to the oldest at the end of the chain
        let mut chain = SmallVec::<[Delta; 10]>::default();
        let first_entry = last.clone();
        let mut cursor = last;
        let mut base_buffer_size: Option<usize> = None;
        let mut object_kind: Option<git_object::Kind> = None;
        let mut consumed_input: Option<usize> = None;

        // Find the first full base, either an undeltified object in the pack or a reference to another object.
        let mut total_delta_data_size: u64 = 0;
        while cursor.header.is_delta() {
            if let Some((kind, packed_size)) = cache.get(self.id, cursor.data_offset, out) {
                base_buffer_size = Some(out.len());
                object_kind = Some(kind);
                // If the input entry is a cache hit, keep the packed size as it must be returned.
                // Otherwise, the packed size will be determined later when decompressing the input delta.
                if total_delta_data_size == 0 {
                    consumed_input = Some(packed_size);
                }
                break;
            }
            total_delta_data_size += cursor.decompressed_size;
            let decompressed_size = cursor
                .decompressed_size
                .try_into()
                .expect("a single delta size small enough to fit a usize");
            chain.push(Delta {
                data: Range {
                    start: 0,
                    end: decompressed_size,
                },
                base_size: 0,
                result_size: 0,
                decompressed_size,
                data_offset: cursor.data_offset,
            });
            use crate::data::entry::Header;
            cursor = match cursor.header {
                Header::OfsDelta { base_distance } => self.entry(cursor.base_pack_offset(base_distance)),
                Header::RefDelta { base_id } => match resolve(base_id.as_ref(), out) {
                    Some(ResolvedBase::InPack(entry)) => entry,
                    Some(ResolvedBase::OutOfPack { end, kind }) => {
                        base_buffer_size = Some(end);
                        object_kind = Some(kind);
                        break;
                    }
                    None => return Err(Error::DeltaBaseUnresolved(base_id)),
                },
                _ => unreachable!("cursor.is_delta() only allows deltas here"),
            };
        }

        // This can happen if the cache held the first entry itself.
        // We will just treat it as an object then, even though it's technically incorrect.
        if chain.is_empty() {
            return Ok(Outcome::from_object_entry(
                object_kind.expect("object kind as set by cache"),
                &first_entry,
                consumed_input.expect("consumed bytes as set by cache"),
            ));
        };

        // The first pass decompresses all delta data and keeps it in our output buffer
        // [<possibly resolved base object>]<delta-1..delta-n>...
        // so that we can find the biggest result size.
        let total_delta_data_size: usize = total_delta_data_size.try_into().expect("delta data to fit in memory");

        let chain_len = chain.len();
        let (first_buffer_end, second_buffer_end) = {
            let delta_start = base_buffer_size.unwrap_or(0);
            out.resize(delta_start + total_delta_data_size, 0);

            let delta_range = Range {
                start: delta_start,
                end: delta_start + total_delta_data_size,
            };
            let mut instructions = &mut out[delta_range.clone()];
            let mut relative_delta_start = 0;
            let mut biggest_result_size = 0;
            for (delta_idx, delta) in chain.iter_mut().rev().enumerate() {
                let consumed_from_data_offset = self.decompress_entry_from_data_offset(
                    delta.data_offset,
                    &mut instructions[..delta.decompressed_size],
                )?;
                let is_last_delta_to_be_applied = delta_idx + 1 == chain_len;
                if is_last_delta_to_be_applied {
                    consumed_input = Some(consumed_from_data_offset);
                }

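                // Delta data begins with two varint-encoded sizes, the expected base size and the
                // result size, each stored little-endian in 7-bit groups with the MSB as continuation bit.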
                let (base_size, offset) = delta::decode_header_size(instructions);
                let mut bytes_consumed_by_header = offset;
                biggest_result_size = biggest_result_size.max(base_size);
                delta.base_size = base_size.try_into().expect("base size fits into usize");

                let (result_size, offset) = delta::decode_header_size(&instructions[offset..]);
                bytes_consumed_by_header += offset;
                biggest_result_size = biggest_result_size.max(result_size);
                delta.result_size = result_size.try_into().expect("result size fits into usize");

                // the absolute location into the instructions buffer, so we keep track of the end point of the last delta
                delta.data.start = relative_delta_start + bytes_consumed_by_header;
                relative_delta_start += delta.decompressed_size;
                delta.data.end = relative_delta_start;

                instructions = &mut instructions[delta.decompressed_size..];
            }

            // Now we can produce a buffer like this
            // [<biggest-result-buffer, possibly filled with resolved base object data>]<biggest-result-buffer><delta-1..delta-n>
            // from [<possibly resolved base object>]<delta-1..delta-n>...
            let biggest_result_size: usize = biggest_result_size
                .try_into()
                .expect("biggest result size small enough to fit into usize");
            let first_buffer_size = biggest_result_size;
            let second_buffer_size = first_buffer_size;
            out.resize(first_buffer_size + second_buffer_size + total_delta_data_size, 0);

            // Now 'rescue' the deltas, because in the next step we possibly overwrite that portion
            // of memory with the base object (in the majority of cases).
            let second_buffer_end = {
                let end = first_buffer_size + second_buffer_size;
                if delta_range.start < end {
                    // …this means that the delta size is even larger than two uncompressed worst-case
                    // intermediate results combined. It would already be undesirable to have it bigger
                    // than the target size (as you could just store the object in whole).
                    // That said, it just means the pack reuses existing deltas smartly, each of which,
                    // as we remember, stands for an object. It also means a lot of data is read to
                    // restore a single object at times. Fair enough - pack size is minimized that way.
                    out.copy_within(delta_range, end);
                } else {
                    let (buffers, instructions) = out.split_at_mut(end);
                    instructions.copy_from_slice(&buffers[delta_range]);
                }
                end
            };

            // If we don't have an out-of-pack object already, fill the base buffer by decompressing the full object
            // at which the cursor is left after the iteration.
            if base_buffer_size.is_none() {
                let base_entry = cursor;
                debug_assert!(!base_entry.header.is_delta());
                object_kind = base_entry.header.as_kind();
                self.decompress_entry_from_data_offset(base_entry.data_offset, out)?;
            }

            (first_buffer_size, second_buffer_end)
        };

        // From oldest to most recent, apply all deltas, swapping the buffer back and forth
        // TODO: once we have more tests, we could optimize this memory-intensive work to
        //       analyse the delta-chains to only copy data once - after all, with 'copy-from-base' deltas,
        //       all data originates from one base at some point.
        // `out` is: [source-buffer][target-buffer][max-delta-instructions-buffer]
        let (buffers, instructions) = out.split_at_mut(second_buffer_end);
        let (mut source_buf, mut target_buf) = buffers.split_at_mut(first_buffer_end);

        let mut last_result_size = None;
        for (
            delta_idx,
            Delta {
                data,
                base_size,
                result_size,
                ..
            },
        ) in chain.into_iter().rev().enumerate()
        {
            let data = &mut instructions[data];
            if delta_idx + 1 == chain_len {
                last_result_size = Some(result_size);
            }
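            // Interpret this delta's copy-from-base and insert instructions to materialize
            // its result in the target buffer.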
            delta::apply(&source_buf[..base_size], &mut target_buf[..result_size], data);
            // use the target as source for the next delta
            std::mem::swap(&mut source_buf, &mut target_buf);
        }

        let last_result_size = last_result_size.expect("at least one delta chain item");
        // With an uneven chain length, the final result ends up in the second buffer and must be
        // copied to the start of `out`.
        // FIXME(Performance) If delta-chains are uneven, we know we will have to copy bytes over here.
        //      Instead we could use a different start buffer, to naturally end up with the result in the
        //      right one.
        //      However, this is a bit more complicated than just that - you have to deal with the base
        //      object, which should also be placed in the second buffer right away. You don't have that
        //      control/knowledge for out-of-pack bases, so this is a special case to deal with, too.
        //      Maybe these invariants can be represented in the type system though.
        if chain_len % 2 == 1 {
            // this seems inverted, but remember: we swapped the buffers on the last iteration
            target_buf[..last_result_size].copy_from_slice(&source_buf[..last_result_size]);
        }
        out.resize(last_result_size, 0);

        let object_kind = object_kind.expect("a base object as root of any delta chain that we are here to resolve");
        let consumed_input = consumed_input.expect("at least one decompressed delta object");
        cache.put(
            self.id,
            first_entry.data_offset,
            out.as_slice(),
            object_kind,
            consumed_input,
        );
        Ok(Outcome {
            kind: object_kind,
            // Technically, depending on the cache, the chain size is not correct as it might
            // have been cut short by a cache hit. The caller must deactivate the cache to get
            // accurate results.
            num_deltas: chain_len as u32,
            decompressed_size: first_entry.decompressed_size,
            compressed_size: consumed_input,
            object_size: last_result_size as u64,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn size_of_decode_entry_outcome() {
        assert_eq!(
            std::mem::size_of::<Outcome>(),
            32,
            "this shouldn't change without us noticing, as it's returned a lot"
        );
    }
}