git_pack/index/write/
mod.rs

1use std::{convert::TryInto, io, sync::atomic::AtomicBool};
2
3pub use error::Error;
4use git_features::progress::{self, Progress};
5
6use crate::cache::delta::{traverse, Tree};
7
8pub(crate) mod encode;
9mod error;
10
/// Per-object payload attached to each node of the delta [`Tree`] while building an index:
/// the object's id (filled in during traversal, initially the null hash) and the CRC32 of
/// its entry within the pack data file.
pub(crate) struct TreeEntry {
    // The object hash; starts out as `object_hash.null()` and is set by `modify_base()` once resolved.
    pub id: git_hash::ObjectId,
    // CRC32 over the pack entry bytes, as computed by the input iterator.
    pub crc32: u32,
}
15
/// Information gathered while executing [`write_data_iter_to_stream()`][crate::index::File::write_data_iter_to_stream]
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
pub struct Outcome {
    /// The version of the index that was written.
    pub index_version: crate::index::Version,
    /// The checksum of the index as written to the output stream; it's also stored in the index's trailing bytes.
    pub index_hash: git_hash::ObjectId,

    /// The hash of the '.pack' file, also found in its trailing bytes
    pub data_hash: git_hash::ObjectId,
    /// The amount of objects that were written into the index, always the amount of objects in the pack.
    pub num_objects: u32,
}
30
/// The progress ids used in [`write_data_iter_to_stream()`][crate::index::File::write_data_iter_to_stream()].
///
/// Use this information to selectively extract the progress of interest in case the parent application has custom visualization.
#[derive(Debug, Copy, Clone)]
pub enum ProgressId {
    /// Counts the amount of objects that were indexed thus far.
    IndexObjects,
    /// The amount of bytes that were decompressed while decoding pack entries.
    ///
    /// This is done to determine entry boundaries.
    DecompressedBytes,
    /// The amount of objects whose hashes were computed.
    ///
    /// This is done by decoding them, which typically involves decoding delta objects.
    ResolveObjects,
    /// The amount of bytes that were decoded in total, as the sum of all bytes to represent all resolved objects.
    DecodedBytes,
    /// The amount of bytes written to the index file.
    IndexBytesWritten,
}
51
52impl From<ProgressId> for git_features::progress::Id {
53    fn from(v: ProgressId) -> Self {
54        match v {
55            ProgressId::IndexObjects => *b"IWIO",
56            ProgressId::DecompressedBytes => *b"IWDB",
57            ProgressId::ResolveObjects => *b"IWRO",
58            ProgressId::DecodedBytes => *b"IWDB",
59            ProgressId::IndexBytesWritten => *b"IWBW",
60        }
61    }
62}
63
/// Various ways of writing an index file from pack entries
impl crate::index::File {
    /// Write information about `entries` as obtained from a pack data file into a pack index file via the `out` stream.
    /// The resolver produced by `make_resolver` must resolve pack entries from the same pack data file that produced the
    /// `entries` iterator.
    ///
    /// * `kind` is the version of pack index to produce, use [`crate::index::Version::default()`] if in doubt.
    /// * `thread_limit` is used for a parallel tree traversal for obtaining object hashes with optimal performance.
    /// * `root_progress` is the top-level progress to stay informed about the progress of this potentially long-running
    ///    computation.
    /// * `object_hash` defines what kind of object hash we write into the index file.
    /// * `pack_version` is the version of the underlying pack for which `entries` are read. It's used in case none of these objects are provided
    ///    to compute a pack-hash.
    ///
    /// # Remarks
    ///
    /// * neither in-pack nor out-of-pack Ref Deltas are supported here, these must have been resolved beforehand.
    /// * `make_resolver()` will only be called after the iterator stopped returning elements and produces a function that
    /// provides all bytes belonging to a pack entry writing them to the given mutable output `Vec`.
    /// It should return `None` if the entry cannot be resolved from the pack that produced the `entries` iterator, causing
    /// the write operation to fail.
    #[allow(clippy::too_many_arguments)]
    pub fn write_data_iter_to_stream<F, F2>(
        version: crate::index::Version,
        make_resolver: F,
        entries: impl Iterator<Item = Result<crate::data::input::Entry, crate::data::input::Error>>,
        thread_limit: Option<usize>,
        mut root_progress: impl Progress,
        out: impl io::Write,
        should_interrupt: &AtomicBool,
        object_hash: git_hash::Kind,
        pack_version: crate::data::Version,
    ) -> Result<Outcome, Error>
    where
        F: FnOnce() -> io::Result<F2>,
        F2: for<'r> Fn(crate::data::EntryRange, &'r mut Vec<u8>) -> Option<()> + Send + Clone,
    {
        // Only the default index version is supported by this writer.
        if version != crate::index::Version::default() {
            return Err(Error::Unsupported(version));
        }
        let mut num_objects: usize = 0;
        let mut last_seen_trailer = None;
        // Prefer the iterator's upper bound if known, otherwise fall back to the lower bound,
        // to pre-size the delta tree.
        let anticipated_num_objects = entries.size_hint().1.unwrap_or_else(|| entries.size_hint().0);
        let mut tree = Tree::with_capacity(anticipated_num_objects)?;
        let indexing_start = std::time::Instant::now();

        // Four top-level steps: indexing, resolving, sorting, writing.
        root_progress.init(Some(4), progress::steps());
        let mut objects_progress = root_progress.add_child_with_id("indexing", ProgressId::IndexObjects.into());
        objects_progress.init(entries.size_hint().1, progress::count("objects"));
        let mut decompressed_progress =
            root_progress.add_child_with_id("decompressing", ProgressId::DecompressedBytes.into());
        decompressed_progress.init(None, progress::bytes());
        let mut pack_entries_end: u64 = 0;

        // Phase 1: single pass over all pack entries, building the delta tree with placeholder
        // (null) object ids; hashes are computed later during traversal.
        for entry in entries {
            let crate::data::input::Entry {
                header,
                pack_offset,
                crc32,
                header_size,
                compressed: _,
                compressed_size,
                decompressed_size,
                trailer,
            } = entry?;

            decompressed_progress.inc_by(decompressed_size as usize);

            // Track the end of the last entry seen so far; after the loop this is the offset
            // right past the final entry, needed by the traversal to delimit entry ranges.
            let entry_len = header_size as u64 + compressed_size;
            pack_entries_end = pack_offset + entry_len;

            let crc32 = crc32.expect("crc32 to be computed by the iterator. Caller assures correct configuration.");

            use crate::data::entry::Header::*;
            match header {
                // Base objects become roots of the delta tree.
                Tree | Blob | Commit | Tag => {
                    tree.add_root(
                        pack_offset,
                        TreeEntry {
                            id: object_hash.null(),
                            crc32,
                        },
                    )?;
                }
                // Ref deltas must have been resolved beforehand - see Remarks above.
                RefDelta { .. } => return Err(Error::IteratorInvariantNoRefDelta),
                OfsDelta { base_distance } => {
                    // Validate the delta's base offset before linking it into the tree.
                    let base_pack_offset =
                        crate::data::entry::Header::verified_base_pack_offset(pack_offset, base_distance).ok_or(
                            Error::IteratorInvariantBaseOffset {
                                pack_offset,
                                distance: base_distance,
                            },
                        )?;
                    tree.add_child(
                        base_pack_offset,
                        pack_offset,
                        TreeEntry {
                            id: object_hash.null(),
                            crc32,
                        },
                    )?;
                }
            };
            // The trailer is only present on the final entry; remember the most recent one.
            last_seen_trailer = trailer;
            num_objects += 1;
            objects_progress.inc();
        }
        if num_objects != anticipated_num_objects {
            objects_progress.info(format!(
                "{anticipated_num_objects} objects were resolved into {num_objects} objects during thin-pack resolution"
            ));
        }
        // The index format stores object counts as u32; overflow is an iterator invariant violation.
        let num_objects: u32 = num_objects
            .try_into()
            .map_err(|_| Error::IteratorInvariantTooManyObjects(num_objects))?;

        objects_progress.show_throughput(indexing_start);
        decompressed_progress.show_throughput(indexing_start);
        drop(objects_progress);
        drop(decompressed_progress);

        root_progress.inc();

        // Phase 2: traverse the delta tree (possibly in parallel), decoding each object to
        // compute its hash, then sort all entries by object id as the index format requires.
        let resolver = make_resolver()?;
        let sorted_pack_offsets_by_oid = {
            let traverse::Outcome { roots, children } = tree.traverse(
                resolver,
                pack_entries_end,
                || (),
                |data,
                 _progress,
                 traverse::Context {
                     entry,
                     decompressed: bytes,
                     ..
                 }| {
                    // Fill in the placeholder id with the hash of the fully decoded object.
                    modify_base(data, entry, bytes, version.hash());
                    Ok::<_, Error>(())
                },
                traverse::Options {
                    object_progress: root_progress.add_child_with_id("Resolving", ProgressId::ResolveObjects.into()),
                    size_progress: root_progress.add_child_with_id("Decoding", ProgressId::DecodedBytes.into()),
                    thread_limit,
                    should_interrupt,
                    object_hash,
                },
            )?;
            root_progress.inc();

            let mut items = roots;
            items.extend(children);
            {
                let _progress = root_progress.add_child_with_id("sorting by id", git_features::progress::UNKNOWN);
                items.sort_by_key(|e| e.data.id);
            }

            root_progress.inc();
            items
        };

        // Phase 3: determine the pack hash to embed in the index. For an empty pack no trailer
        // was seen, so hash a synthesized empty-pack header instead.
        let pack_hash = match last_seen_trailer {
            Some(ph) => ph,
            None if num_objects == 0 => {
                let header = crate::data::header::encode(pack_version, 0);
                let mut hasher = git_features::hash::hasher(object_hash);
                hasher.update(&header);
                git_hash::ObjectId::from(hasher.digest())
            }
            // A non-empty pack without a trailer violates the iterator's contract.
            None => return Err(Error::IteratorInvariantTrailer),
        };
        // Phase 4: serialize the index itself.
        let index_hash = encode::write_to(
            out,
            sorted_pack_offsets_by_oid,
            &pack_hash,
            version,
            root_progress.add_child_with_id("writing index file", ProgressId::IndexBytesWritten.into()),
        )?;
        root_progress.show_throughput_with(
            indexing_start,
            num_objects as usize,
            progress::count("objects").expect("unit always set"),
            progress::MessageLevel::Success,
        );
        Ok(Outcome {
            index_version: version,
            index_hash,
            data_hash: pack_hash,
            num_objects,
        })
    }
}
255
256fn modify_base(entry: &mut TreeEntry, pack_entry: &crate::data::Entry, decompressed: &[u8], hash: git_hash::Kind) {
257    fn compute_hash(kind: git_object::Kind, bytes: &[u8], object_hash: git_hash::Kind) -> git_hash::ObjectId {
258        let mut hasher = git_features::hash::hasher(object_hash);
259        hasher.update(&git_object::encode::loose_header(kind, bytes.len()));
260        hasher.update(bytes);
261        git_hash::ObjectId::from(hasher.digest())
262    }
263
264    let object_kind = pack_entry.header.as_kind().expect("base object as source of iteration");
265    let id = compute_hash(object_kind, decompressed, hash);
266    entry.id = id;
267}