gix_index/decode/mod.rs

use filetime::FileTime;

use crate::{entry, extension, Entry, State, Version};

mod entries;
/// Decoding of the index header.
pub mod header;

mod error {

    use crate::{decode, extension};

    /// The error returned by [`State::from_bytes()`][crate::State::from_bytes()].
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error(transparent)]
        Header(#[from] decode::header::Error),
        #[error("Could not hash index data")]
        Hasher(#[from] gix_hash::hasher::Error),
        #[error("Could not parse entry at index {index}")]
        Entry { index: u32 },
        #[error("A mandatory extension wasn't implemented, or it was malformed")]
        Extension(#[from] extension::decode::Error),
        #[error("Index trailer should have been {expected} bytes long, but was {actual}")]
        UnexpectedTrailerLength { expected: usize, actual: usize },
        #[error("Shared index checksum mismatch")]
        Verify(#[from] gix_hash::verify::Error),
    }
}
pub use error::Error;
use gix_features::parallel::InOrderIter;

use crate::util::read_u32;

/// Options to define how to decode an index state [from bytes][State::from_bytes()].
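///
/// # Example
///
/// A minimal sketch of a configuration that limits decoding to a single thread
/// while keeping all other defaults:
///
/// ```
/// let opts = gix_index::decode::Options {
///     thread_limit: Some(1),
///     ..Default::default()
/// };
/// ```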
#[derive(Debug, Default, Clone, Copy)]
pub struct Options {
    /// If `Some(N)` with `N > 0`, use at most `N` threads. If `Some(0)` or `None`, use as many threads
    /// as there are logical cores.
    ///
    /// This applies to loading extensions in parallel to entries if the common EOIE extension is available.
    /// It also allows entries to be loaded with multiple threads if the IEOT extension is present.
    pub thread_limit: Option<usize>,
    /// The minimum size in bytes an extension block must have to be loaded in its own thread, assuming enough threads are available.
    /// If set to 0, for example, extensions will always be read in their own thread if enough threads are available.
    pub min_extension_block_in_bytes_for_threading: usize,
    /// Set the expected hash of this index if we are read as part of a `link` extension.
    ///
    /// We will abort reading this file if it doesn't match.
    pub expected_checksum: Option<gix_hash::ObjectId>,
}

impl State {
    /// Decode an index state from `data` and store `timestamp` in the resulting instance for pass-through, assuming `object_hash`
    /// to be used throughout the file. Also return the stored hash over all bytes in `data`, or `None` if none was written due to `index.skipHash`.
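    ///
    /// # Example
    ///
    /// A minimal sketch of loading and decoding an index file from disk; the
    /// `.git/index` path and the SHA-1 object hash kind are assumptions for illustration.
    ///
    /// ```no_run
    /// use std::path::Path;
    ///
    /// let path = Path::new(".git/index");
    /// let data = std::fs::read(path)?;
    /// // Pass the file's modification time through to the resulting `State`.
    /// let timestamp = filetime::FileTime::from_last_modification_time(&path.metadata()?);
    /// let (state, _checksum) = gix_index::State::from_bytes(
    ///     &data,
    ///     timestamp,
    ///     gix_hash::Kind::Sha1,
    ///     gix_index::decode::Options::default(),
    /// )?;
    /// println!("decoded {} entries", state.entries().len());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```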
    pub fn from_bytes(
        data: &[u8],
        timestamp: FileTime,
        object_hash: gix_hash::Kind,
        _options @ Options {
            thread_limit,
            min_extension_block_in_bytes_for_threading,
            expected_checksum,
        }: Options,
    ) -> Result<(Self, Option<gix_hash::ObjectId>), Error> {
        let _span = gix_features::trace::detail!("gix_index::State::from_bytes()", options = ?_options);
        let (version, num_entries, post_header_data) = header::decode(data, object_hash)?;
        let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash)?;

        let mut num_threads = gix_features::parallel::num_threads(thread_limit);
        let path_backing_buffer_size = entries::estimate_path_storage_requirements_in_bytes(
            num_entries,
            data.len(),
            start_of_extensions,
            object_hash,
            version,
        );

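        // If the EOIE extension already told us where extensions begin, and more than one
        // thread is available, decode extensions concurrently with entries. The IEOT
        // extension, if present, additionally allows splitting entry decoding across threads.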
        let (entries, ext, data) = match start_of_extensions {
            Some(offset) if num_threads > 1 => {
                let extensions_data = &data[offset..];
                let index_offsets_table = extension::index_entry_offset_table::find(extensions_data, object_hash);
                let (entries_res, ext_res) = gix_features::parallel::threads(|scope| {
                    let extension_loading =
                        (extensions_data.len() > min_extension_block_in_bytes_for_threading).then({
                            num_threads -= 1;
                            || {
                                gix_features::parallel::build_thread()
                                    .name("gix-index.from_bytes.load-extensions".into())
                                    .spawn_scoped(scope, || extension::decode::all(extensions_data, object_hash))
                                    .expect("valid name")
                            }
                        });
                    let entries_res = match index_offsets_table {
                        Some(entry_offsets) => {
                            let chunk_size = (entry_offsets.len() as f32 / num_threads as f32).ceil() as usize;
                            let entry_offsets_chunked = entry_offsets.chunks(chunk_size);
                            let num_chunks = entry_offsets_chunked.len();
                            let mut threads = Vec::with_capacity(num_chunks);
                            for (id, chunks) in entry_offsets_chunked.enumerate() {
                                let chunks = chunks.to_vec();
                                threads.push(
                                    gix_features::parallel::build_thread()
                                        .name(format!("gix-index.from_bytes.read-entries.{id}"))
                                        .spawn_scoped(scope, move || {
                                            let num_entries_for_chunks =
                                                chunks.iter().map(|c| c.num_entries).sum::<u32>() as usize;
                                            let mut entries = Vec::with_capacity(num_entries_for_chunks);
                                            let path_backing_buffer_size_for_chunks =
                                                entries::estimate_path_storage_requirements_in_bytes(
                                                    num_entries_for_chunks as u32,
                                                    data.len() / num_chunks,
                                                    start_of_extensions.map(|ofs| ofs / num_chunks),
                                                    object_hash,
                                                    version,
                                                );
                                            let mut path_backing =
                                                Vec::with_capacity(path_backing_buffer_size_for_chunks);
                                            let mut is_sparse = false;
                                            for offset in chunks {
                                                let (
                                                    entries::Outcome {
                                                        is_sparse: chunk_is_sparse,
                                                    },
                                                    _data,
                                                ) = entries::chunk(
                                                    &data[offset.from_beginning_of_file as usize..],
                                                    &mut entries,
                                                    &mut path_backing,
                                                    offset.num_entries,
                                                    object_hash,
                                                    version,
                                                )?;
                                                is_sparse |= chunk_is_sparse;
                                            }
                                            Ok::<_, Error>((
                                                id,
                                                EntriesOutcome {
                                                    entries,
                                                    path_backing,
                                                    is_sparse,
                                                },
                                            ))
                                        })
                                        .expect("valid name"),
                                );
                            }
                            let mut results =
                                InOrderIter::from(threads.into_iter().map(|thread| thread.join().unwrap()));
                            let mut acc = results.next().expect("have at least one result, one per thread");
                            // We explicitly don't adjust the reserve in acc and rather allow for more copying
                            // to happen as vectors grow to keep the peak memory size low.
                            // NOTE: one day, we might use a memory pool for paths. We could encode the block of memory
                            //       in some bytes in the path offset. That way there is more indirection/slower access
                            //       to the path, but it would save time here.
                            //       As it stands, `git` is definitely more efficient at this and probably uses less memory too.
                            //       Maybe benchmarks can tell if that is noticeable later at 200/400GB/s memory bandwidth, or maybe just
                            //       100GB/s on a single core.
                            while let (Ok(lhs), Some(res)) = (acc.as_mut(), results.next()) {
                                match res {
                                    Ok(mut rhs) => {
                                        lhs.is_sparse |= rhs.is_sparse;
                                        let ofs = lhs.path_backing.len();
                                        lhs.path_backing.append(&mut rhs.path_backing);
                                        lhs.entries.extend(rhs.entries.into_iter().map(|mut e| {
                                            e.path.start += ofs;
                                            e.path.end += ofs;
                                            e
                                        }));
                                    }
                                    Err(err) => {
                                        acc = Err(err);
                                    }
                                }
                            }
                            acc.map(|acc| (acc, &data[data.len() - object_hash.len_in_bytes()..]))
                        }
                        None => entries(
                            post_header_data,
                            path_backing_buffer_size,
                            num_entries,
                            object_hash,
                            version,
                        ),
                    };
                    let ext_res = extension_loading.map_or_else(
                        || extension::decode::all(extensions_data, object_hash),
                        |thread| thread.join().unwrap(),
                    );
                    (entries_res, ext_res)
                });
                let (ext, data) = ext_res?;
                (entries_res?.0, ext, data)
            }
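            // Fall back to single-threaded decoding: parse all entries first, then all extensions.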
            None | Some(_) => {
                let (entries, data) = entries(
                    post_header_data,
                    path_backing_buffer_size,
                    num_entries,
                    object_hash,
                    version,
                )?;
                let (ext, data) = extension::decode::all(data, object_hash)?;
                (entries, ext, data)
            }
        };

        if data.len() != object_hash.len_in_bytes() {
            return Err(Error::UnexpectedTrailerLength {
                expected: object_hash.len_in_bytes(),
                actual: data.len(),
            });
        }

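        // An all-zero trailer means the writer skipped hashing, as with `index.skipHash`;
        // in that case there is no checksum to return or to verify.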
        let checksum = gix_hash::ObjectId::from_bytes_or_panic(data);
        let checksum = (!checksum.is_null()).then_some(checksum);
        if let Some((expected_checksum, actual_checksum)) = expected_checksum.zip(checksum) {
            actual_checksum.verify(&expected_checksum)?;
        }
        let EntriesOutcome {
            entries,
            path_backing,
            mut is_sparse,
        } = entries;
        let extension::decode::Outcome {
            tree,
            link,
            resolve_undo,
            untracked,
            fs_monitor,
            is_sparse: is_sparse_from_ext, // a marker is needed in case there are no directories
            end_of_index,
            offset_table,
        } = ext;
        is_sparse |= is_sparse_from_ext;

        Ok((
            State {
                object_hash,
                timestamp,
                version,
                entries,
                path_backing,
                is_sparse,

                end_of_index_at_decode_time: end_of_index,
                offset_table_at_decode_time: offset_table,
                tree,
                link,
                resolve_undo,
                untracked,
                fs_monitor,
            },
            checksum,
        ))
    }
}

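/// The result of decoding one block of entries: the entries themselves, the buffer
/// backing all of their paths, and whether any sparse entries were encountered.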
struct EntriesOutcome {
    pub entries: Vec<Entry>,
    pub path_backing: Vec<u8>,
    pub is_sparse: bool,
}

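/// Decode all `num_entries` entries from `post_header_data` on the current thread,
/// returning them along with the yet-unconsumed bytes that follow them.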
fn entries(
    post_header_data: &[u8],
    path_backing_buffer_size: usize,
    num_entries: u32,
    object_hash: gix_hash::Kind,
    version: Version,
) -> Result<(EntriesOutcome, &[u8]), Error> {
    let mut entries = Vec::with_capacity(num_entries as usize);
    let mut path_backing = Vec::with_capacity(path_backing_buffer_size);
    entries::chunk(
        post_header_data,
        &mut entries,
        &mut path_backing,
        num_entries,
        object_hash,
        version,
    )
    .map(|(entries::Outcome { is_sparse }, data): (entries::Outcome, &[u8])| {
        (
            EntriesOutcome {
                entries,
                path_backing,
                is_sparse,
            },
            data,
        )
    })
}

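/// Decode stat data as stored in the index: ctime and mtime as seconds and nanoseconds,
/// followed by `dev`, `ino`, `uid`, `gid` and `size`, each read as a 32-bit value.
/// Returns `None` if `data` is too short.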
pub(crate) fn stat(data: &[u8]) -> Option<(entry::Stat, &[u8])> {
    let (ctime_secs, data) = read_u32(data)?;
    let (ctime_nsecs, data) = read_u32(data)?;
    let (mtime_secs, data) = read_u32(data)?;
    let (mtime_nsecs, data) = read_u32(data)?;
    let (dev, data) = read_u32(data)?;
    let (ino, data) = read_u32(data)?;
    let (uid, data) = read_u32(data)?;
    let (gid, data) = read_u32(data)?;
    let (size, data) = read_u32(data)?;
    Some((
        entry::Stat {
            mtime: entry::stat::Time {
                secs: mtime_secs,
                nsecs: mtime_nsecs,
            },
            ctime: entry::stat::Time {
                secs: ctime_secs,
                nsecs: ctime_nsecs,
            },
            dev,
            ino,
            uid,
            gid,
            size,
        },
        data,
    ))
}