Skip to main content

gix_pack/index/traverse/
with_index.rs

1use std::sync::atomic::{AtomicBool, Ordering};
2
3use gix_features::{parallel, progress::DynNestedProgress};
4
5use super::Error;
6use crate::{
7    cache::delta::traverse,
8    index::{self, traverse::Outcome, util::index_entries_sorted_by_offset_ascending},
9};
10
11/// Traversal options for [`traverse_with_index()`][index::File::traverse_with_index()]
12#[derive(Default)]
13pub struct Options {
14    /// If `Some`, only use the given number of threads. Otherwise, the number of threads to use will be selected based on
15    /// the number of available logical cores.
16    pub thread_limit: Option<usize>,
17    /// The kinds of safety checks to perform.
18    pub check: crate::index::traverse::SafetyCheck,
19}
20
21/// The progress ids used in [`index::File::traverse_with_index()`].
22///
23/// Use this information to selectively extract the progress of interest in case the parent application has custom visualization.
24#[derive(Debug, Copy, Clone)]
25pub enum ProgressId {
26    /// The amount of bytes currently processed to generate a checksum of the *pack data file*.
27    HashPackDataBytes,
28    /// The amount of bytes currently processed to generate a checksum of the *pack index file*.
29    HashPackIndexBytes,
30    /// Collect all object hashes into a vector and sort it by their pack offset.
31    CollectSortedIndexEntries,
32    /// Count the objects processed when building a cache tree from all objects in a pack index.
33    TreeFromOffsetsObjects,
34    /// The amount of objects which were decoded.
35    DecodedObjects,
36    /// The amount of bytes that were decoded in total, as the sum of all bytes to represent all decoded objects.
37    DecodedBytes,
38}
39
40impl From<ProgressId> for gix_features::progress::Id {
41    fn from(v: ProgressId) -> Self {
42        match v {
43            ProgressId::HashPackDataBytes => *b"PTHP",
44            ProgressId::HashPackIndexBytes => *b"PTHI",
45            ProgressId::CollectSortedIndexEntries => *b"PTCE",
46            ProgressId::TreeFromOffsetsObjects => *b"PTDI",
47            ProgressId::DecodedObjects => *b"PTRO",
48            ProgressId::DecodedBytes => *b"PTDB",
49        }
50    }
51}
52
53/// Traversal with index
54impl<T> index::File<T>
55where
56    T: crate::FileData + Sync,
57{
58    /// Iterate through all _decoded objects_ in the given `pack` and handle them with a `Processor`, using an index to reduce waste
59    /// at the cost of memory.
60    ///
61    /// For more details, see the documentation on the [`traverse()`][index::File::traverse()] method.
62    pub fn traverse_with_index<Processor, E, D>(
63        &self,
64        pack: &crate::data::File<D>,
65        mut processor: Processor,
66        progress: &mut dyn DynNestedProgress,
67        should_interrupt: &AtomicBool,
68        Options { check, thread_limit }: Options,
69    ) -> Result<Outcome, Error<E>>
70    where
71        Processor: FnMut(gix_object::Kind, &[u8], &index::Entry, &dyn gix_features::progress::Progress) -> Result<(), E>
72            + Send
73            + Clone,
74        E: std::error::Error + Send + Sync + 'static,
75        D: crate::FileData + Send + Sync,
76    {
77        let (verify_result, traversal_result) = parallel::join(
78            {
79                let mut pack_progress = progress.add_child_with_id(
80                    format!("Hash of pack '{}'", crate::source_name(pack.path())),
81                    ProgressId::HashPackDataBytes.into(),
82                );
83                let mut index_progress = progress.add_child_with_id(
84                    format!("Hash of index '{}'", crate::source_name(&self.path)),
85                    ProgressId::HashPackIndexBytes.into(),
86                );
87                move || {
88                    let res =
89                        self.possibly_verify(pack, check, &mut pack_progress, &mut index_progress, should_interrupt);
90                    if res.is_err() {
91                        should_interrupt.store(true, Ordering::SeqCst);
92                    }
93                    res
94                }
95            },
96            || -> Result<_, Error<_>> {
97                let sorted_entries = index_entries_sorted_by_offset_ascending(
98                    self,
99                    &mut progress.add_child_with_id(
100                        "collecting sorted index".into(),
101                        ProgressId::CollectSortedIndexEntries.into(),
102                    ),
103                ); /* Pack Traverse Collect sorted Entries */
104                let tree = crate::cache::delta::Tree::from_offsets_in_pack(
105                    pack.path(),
106                    sorted_entries.into_iter().map(Entry::from),
107                    &|e| e.index_entry.pack_offset,
108                    &|id| self.lookup(id).map(|idx| self.pack_offset_at_index(idx)),
109                    &mut progress.add_child_with_id("indexing".into(), ProgressId::TreeFromOffsetsObjects.into()),
110                    should_interrupt,
111                    self.object_hash,
112                )?;
113                let mut outcome = digest_statistics(tree.traverse(
114                    |slice, pack| pack.entry_slice(slice),
115                    pack,
116                    pack.pack_end() as u64,
117                    move |data,
118                          progress,
119                          traverse::Context {
120                              entry: pack_entry,
121                              entry_end,
122                              decompressed: bytes,
123                              level,
124                          }| {
125                        let object_kind = pack_entry.header.as_kind().expect("non-delta object");
126                        data.level = level;
127                        data.decompressed_size = pack_entry.decompressed_size;
128                        data.object_kind = object_kind;
129                        data.compressed_size = entry_end - pack_entry.data_offset;
130                        data.object_size = bytes.len() as u64;
131                        let result = index::traverse::process_entry(
132                            check,
133                            object_kind,
134                            bytes,
135                            &data.index_entry,
136                            || {
137                                // TODO: Fix this - we overwrite the header of 'data' which also changes the computed entry size,
138                                // causing index and pack to seemingly mismatch. This is surprising, and should be done differently.
139                                // debug_assert_eq!(&data.index_entry.pack_offset, &pack_entry.pack_offset());
140                                gix_features::hash::crc32(
141                                    pack.entry_slice(data.index_entry.pack_offset..entry_end)
142                                        .expect("slice pointing into the pack (by now data is verified)"),
143                                )
144                            },
145                            progress,
146                            &mut processor,
147                        );
148                        match result {
149                            Err(err @ Error::PackDecode { .. }) if !check.fatal_decode_error() => {
150                                progress.info(format!("Ignoring decode error: {err}"));
151                                Ok(())
152                            }
153                            res => res,
154                        }
155                    },
156                    traverse::Options {
157                        object_progress: Box::new(
158                            progress.add_child_with_id("Resolving".into(), ProgressId::DecodedObjects.into()),
159                        ),
160                        size_progress:
161                            &mut progress.add_child_with_id("Decoding".into(), ProgressId::DecodedBytes.into()),
162                        thread_limit,
163                        should_interrupt,
164                        object_hash: self.object_hash,
165                    },
166                )?);
167                outcome.pack_size = pack.data_len() as u64;
168                Ok(outcome)
169            },
170        );
171        Ok(Outcome {
172            actual_index_checksum: verify_result?,
173            statistics: traversal_result?,
174        })
175    }
176}
177
178struct Entry {
179    index_entry: crate::index::Entry,
180    object_kind: gix_object::Kind,
181    object_size: u64,
182    decompressed_size: u64,
183    compressed_size: u64,
184    level: u16,
185}
186
187impl From<crate::index::Entry> for Entry {
188    fn from(index_entry: crate::index::Entry) -> Self {
189        Entry {
190            index_entry,
191            level: 0,
192            object_kind: gix_object::Kind::Tree,
193            object_size: 0,
194            decompressed_size: 0,
195            compressed_size: 0,
196        }
197    }
198}
199
200fn digest_statistics(traverse::Outcome { roots, children }: traverse::Outcome<Entry>) -> index::traverse::Statistics {
201    let mut res = index::traverse::Statistics::default();
202    let average = &mut res.average;
203    for item in roots.iter().chain(children.iter()) {
204        res.total_compressed_entries_size += item.data.compressed_size;
205        res.total_decompressed_entries_size += item.data.decompressed_size;
206        res.total_object_size += item.data.object_size;
207        *res.objects_per_chain_length
208            .entry(u32::from(item.data.level))
209            .or_insert(0) += 1;
210
211        average.decompressed_size += item.data.decompressed_size;
212        average.compressed_size += item.data.compressed_size as usize;
213        average.object_size += item.data.object_size;
214        average.num_deltas += u32::from(item.data.level);
215        use gix_object::Kind::*;
216        match item.data.object_kind {
217            Blob => res.num_blobs += 1,
218            Tree => res.num_trees += 1,
219            Tag => res.num_tags += 1,
220            Commit => res.num_commits += 1,
221        }
222    }
223
224    let num_nodes = roots.len() + children.len();
225    average.decompressed_size /= num_nodes as u64;
226    average.compressed_size /= num_nodes;
227    average.object_size /= num_nodes as u64;
228    average.num_deltas /= num_nodes as u32;
229
230    res
231}