// git_index/decode/mod.rs

1use filetime::FileTime;
2
3use crate::{entry, extension, Entry, State, Version};
4
5mod entries;
6///
7pub mod header;
8
mod error {

    use crate::{decode, extension};

    /// The error returned by [State::from_bytes()][crate::State::from_bytes()].
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        /// The 12-byte header could not be decoded.
        #[error(transparent)]
        Header(#[from] decode::header::Error),
        /// An entry at the given zero-based `index` could not be parsed.
        #[error("Could not parse entry at index {index}")]
        Entry { index: u32 },
        /// Extension decoding failed, or a mandatory extension was not understood.
        #[error("Mandatory extension wasn't implemented or malformed.")]
        Extension(#[from] extension::decode::Error),
        /// After entries and extensions, the remaining bytes did not match the hash length.
        #[error("Index trailer should have been {expected} bytes long, but was {actual}")]
        UnexpectedTrailerLength { expected: usize, actual: usize },
        /// The checksum of a shared index (via `link` extension) did not match expectations.
        #[error("Shared index checksum was {actual_checksum} but should have been {expected_checksum}")]
        ChecksumMismatch {
            actual_checksum: git_hash::ObjectId,
            expected_checksum: git_hash::ObjectId,
        },
    }
}
32pub use error::Error;
33use git_features::parallel::InOrderIter;
34
35use crate::util::read_u32;
36
/// Options to define how to decode an index state [from bytes][State::from_bytes()].
///
/// `Options::default()` uses all logical cores and no checksum expectation.
#[derive(Default, Clone, Copy)]
pub struct Options {
    /// If Some(_), we are allowed to use more than one thread. If Some(N), use no more than N threads. If Some(0)|None, use as many threads
    /// as there are logical cores.
    ///
    /// This applies to loading extensions in parallel to entries if the common EOIE extension is available.
    /// It also allows to use multiple threads for loading entries if the IEOT extension is present.
    pub thread_limit: Option<usize>,
    /// The minimum size in bytes to load extensions in their own thread, assuming there is enough `num_threads` available.
    /// If set to 0, for example, extensions will always be read in their own thread if enough threads are available.
    pub min_extension_block_in_bytes_for_threading: usize,
    /// Set the expected hash of this index if we are read as part of a `link` extension.
    ///
    /// We will abort reading this file if it doesn't match.
    pub expected_checksum: Option<git_hash::ObjectId>,
}
54
impl State {
    /// Decode an index state from `data` and store `timestamp` in the resulting instance for pass-through, assuming `object_hash`
    /// to be used through the file.
    ///
    /// Returns the decoded [`State`] along with the index file's trailing checksum.
    ///
    /// # Errors
    /// Fails if the header, any entry, an extension, or the trailer is malformed, or if
    /// `expected_checksum` is set and does not match the actual trailer checksum.
    pub fn from_bytes(
        data: &[u8],
        timestamp: FileTime,
        object_hash: git_hash::Kind,
        Options {
            thread_limit,
            min_extension_block_in_bytes_for_threading,
            expected_checksum,
        }: Options,
    ) -> Result<(Self, git_hash::ObjectId), Error> {
        let (version, num_entries, post_header_data) = header::decode(data, object_hash)?;
        // The EOIE extension, if present, tells us where the extension section starts —
        // the prerequisite for decoding entries and extensions concurrently.
        let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash);

        let mut num_threads = git_features::parallel::num_threads(thread_limit);
        let path_backing_buffer_size = entries::estimate_path_storage_requirements_in_bytes(
            num_entries,
            data.len(),
            start_of_extensions,
            object_hash,
            version,
        );

        let (entries, ext, data) = match start_of_extensions {
            // Parallel path: only taken when the extension offset is known and more than one thread is allowed.
            Some(offset) if num_threads > 1 => {
                let extensions_data = &data[offset..];
                let index_offsets_table = extension::index_entry_offset_table::find(extensions_data, object_hash);
                let (entries_res, ext_res) = git_features::parallel::threads(|scope| {
                    // Spawn a dedicated extension-decoding thread if the extension block is large enough.
                    // NOTE(review): the block passed to `then(...)` is evaluated eagerly as an argument,
                    // so `num_threads -= 1` runs even when the condition is false — presumably it was
                    // intended to run only when the thread is actually spawned; confirm.
                    let extension_loading =
                        (extensions_data.len() > min_extension_block_in_bytes_for_threading).then({
                            num_threads -= 1;
                            || {
                                scope
                                    .builder()
                                    .name("git-index.from_bytes.load-extensions".into())
                                    .spawn(|_| extension::decode::all(extensions_data, object_hash))
                                    .expect("valid name")
                            }
                        });
                    let entries_res = match index_offsets_table {
                        // IEOT extension present: entry decoding is split across threads by offset chunks.
                        Some(entry_offsets) => {
                            let chunk_size = (entry_offsets.len() as f32 / num_threads as f32).ceil() as usize;
                            let num_chunks = entry_offsets.chunks(chunk_size).count();
                            let mut threads = Vec::with_capacity(num_chunks);
                            for (id, chunks) in entry_offsets.chunks(chunk_size).enumerate() {
                                let chunks = chunks.to_vec();
                                threads.push(
                                    scope
                                        .builder()
                                        .name(format!("git-index.from_bytes.read-entries.{id}"))
                                        .spawn(move |_| {
                                            let num_entries_for_chunks =
                                                chunks.iter().map(|c| c.num_entries).sum::<u32>() as usize;
                                            let mut entries = Vec::with_capacity(num_entries_for_chunks);
                                            // Scale the path-storage estimate down to this thread's share of the data.
                                            let path_backing_buffer_size_for_chunks =
                                                entries::estimate_path_storage_requirements_in_bytes(
                                                    num_entries_for_chunks as u32,
                                                    data.len() / num_chunks,
                                                    start_of_extensions.map(|ofs| ofs / num_chunks),
                                                    object_hash,
                                                    version,
                                                );
                                            let mut path_backing =
                                                Vec::with_capacity(path_backing_buffer_size_for_chunks);
                                            let mut is_sparse = false;
                                            for offset in chunks {
                                                let (
                                                    entries::Outcome {
                                                        is_sparse: chunk_is_sparse,
                                                    },
                                                    _data,
                                                ) = entries::chunk(
                                                    &data[offset.from_beginning_of_file as usize..],
                                                    &mut entries,
                                                    &mut path_backing,
                                                    offset.num_entries,
                                                    object_hash,
                                                    version,
                                                )?;
                                                is_sparse |= chunk_is_sparse;
                                            }
                                            // Tag the result with the chunk id so `InOrderIter` can restore file order.
                                            Ok::<_, Error>((
                                                id,
                                                EntriesOutcome {
                                                    entries,
                                                    path_backing,
                                                    is_sparse,
                                                },
                                            ))
                                        })
                                        .expect("valid name"),
                                );
                            }
                            let mut results =
                                InOrderIter::from(threads.into_iter().map(|thread| thread.join().unwrap()));
                            let mut acc = results.next().expect("have at least two results, one per thread");
                            // We explicitly don't adjust the reserve in acc and rather allow for more copying
                            // to happens as vectors grow to keep the peak memory size low.
                            // NOTE: one day, we might use a memory pool for paths. We could encode the block of memory
                            //       in some bytes in the path offset. That way there is more indirection/slower access
                            //       to the path, but it would save time here.
                            //       As it stands, `git` is definitely more efficient at this and probably uses less memory too.
                            //       Maybe benchmarks can tell if that is noticeable later at 200/400GB/s memory bandwidth, or maybe just
                            //       100GB/s on a single core.
                            while let (Ok(lhs), Some(res)) = (acc.as_mut(), results.next()) {
                                match res {
                                    Ok(rhs) => {
                                        lhs.is_sparse |= rhs.is_sparse;
                                        let ofs = lhs.path_backing.len();
                                        lhs.path_backing.extend(rhs.path_backing);
                                        // Rebase this chunk's path ranges onto the merged backing buffer.
                                        lhs.entries.extend(rhs.entries.into_iter().map(|mut e| {
                                            e.path.start += ofs;
                                            e.path.end += ofs;
                                            e
                                        }));
                                    }
                                    Err(err) => {
                                        acc = Err(err);
                                    }
                                }
                            }
                            // On success, the trailer checksum occupies the last `len_in_bytes` of the input.
                            acc.map(|acc| (acc, &data[data.len() - object_hash.len_in_bytes()..]))
                        }
                        // No IEOT: decode all entries sequentially on this thread while (possibly)
                        // extensions load on the dedicated thread above.
                        None => entries(
                            post_header_data,
                            path_backing_buffer_size,
                            num_entries,
                            object_hash,
                            version,
                        ),
                    };
                    // Collect the extension result from its thread, or decode inline if none was spawned.
                    let ext_res = extension_loading
                        .map(|thread| thread.join().unwrap())
                        .unwrap_or_else(|| extension::decode::all(extensions_data, object_hash));
                    (entries_res, ext_res)
                })
                .unwrap(); // this unwrap is for panics - if these happened we are done anyway.
                let (ext, data) = ext_res?;
                (entries_res?.0, ext, data)
            }
            // Sequential path: decode entries first, then extensions from the remaining bytes.
            None | Some(_) => {
                let (entries, data) = entries(
                    post_header_data,
                    path_backing_buffer_size,
                    num_entries,
                    object_hash,
                    version,
                )?;
                let (ext, data) = extension::decode::all(data, object_hash)?;
                (entries, ext, data)
            }
        };

        // Whatever remains must be exactly the trailing hash.
        if data.len() != object_hash.len_in_bytes() {
            return Err(Error::UnexpectedTrailerLength {
                expected: object_hash.len_in_bytes(),
                actual: data.len(),
            });
        }

        let checksum = git_hash::ObjectId::from(data);
        if let Some(expected_checksum) = expected_checksum {
            if checksum != expected_checksum {
                return Err(Error::ChecksumMismatch {
                    actual_checksum: checksum,
                    expected_checksum,
                });
            }
        }
        let EntriesOutcome {
            entries,
            path_backing,
            mut is_sparse,
        } = entries;
        let extension::decode::Outcome {
            tree,
            link,
            resolve_undo,
            untracked,
            fs_monitor,
            is_sparse: is_sparse_from_ext, // a marker is needed in case there are no directories
        } = ext;
        is_sparse |= is_sparse_from_ext;

        Ok((
            State {
                object_hash,
                timestamp,
                version,
                entries,
                path_backing,
                is_sparse,

                tree,
                link,
                resolve_undo,
                untracked,
                fs_monitor,
            },
            checksum,
        ))
    }
}
260
/// Intermediate result of decoding the entries section of an index file,
/// before being merged into the final `State`.
struct EntriesOutcome {
    /// The decoded index entries.
    pub entries: Vec<Entry>,
    /// Backing storage for all entry paths; entries hold `start..end` ranges into it.
    pub path_backing: Vec<u8>,
    /// Whether any decoded entry marked the index as sparse.
    pub is_sparse: bool,
}
266
267fn entries(
268    post_header_data: &[u8],
269    path_backing_buffer_size: usize,
270    num_entries: u32,
271    object_hash: git_hash::Kind,
272    version: Version,
273) -> Result<(EntriesOutcome, &[u8]), Error> {
274    let mut entries = Vec::with_capacity(num_entries as usize);
275    let mut path_backing = Vec::with_capacity(path_backing_buffer_size);
276    entries::chunk(
277        post_header_data,
278        &mut entries,
279        &mut path_backing,
280        num_entries,
281        object_hash,
282        version,
283    )
284    .map(|(entries::Outcome { is_sparse }, data): (entries::Outcome, &[u8])| {
285        (
286            EntriesOutcome {
287                entries,
288                path_backing,
289                is_sparse,
290            },
291            data,
292        )
293    })
294}
295
296pub(crate) fn stat(data: &[u8]) -> Option<(entry::Stat, &[u8])> {
297    let (ctime_secs, data) = read_u32(data)?;
298    let (ctime_nsecs, data) = read_u32(data)?;
299    let (mtime_secs, data) = read_u32(data)?;
300    let (mtime_nsecs, data) = read_u32(data)?;
301    let (dev, data) = read_u32(data)?;
302    let (ino, data) = read_u32(data)?;
303    let (uid, data) = read_u32(data)?;
304    let (gid, data) = read_u32(data)?;
305    let (size, data) = read_u32(data)?;
306    Some((
307        entry::Stat {
308            mtime: entry::Time {
309                secs: ctime_secs,
310                nsecs: ctime_nsecs,
311            },
312            ctime: entry::Time {
313                secs: mtime_secs,
314                nsecs: mtime_nsecs,
315            },
316            dev,
317            ino,
318            uid,
319            gid,
320            size,
321        },
322        data,
323    ))
324}