git_pack/multi_index/
chunk.rs

/// Information for the chunk about index names
pub mod index_names {
    use std::path::{Path, PathBuf};

    use git_object::bstr::{BString, ByteSlice};

    /// The ID used for the index-names chunk.
    pub const ID: git_chunk::Id = *b"PNAM";

    ///
    pub mod decode {
        use git_object::bstr::BString;

        /// The error returned by [from_bytes()][super::from_bytes()].
        #[derive(Debug, thiserror::Error)]
        #[allow(missing_docs)]
        pub enum Error {
            #[error("The pack names were not ordered alphabetically.")]
            NotOrderedAlphabetically,
            #[error("Each pack path name must be terminated with a null byte")]
            MissingNullByte,
            #[error("Couldn't turn path '{path}' into OS path due to encoding issues")]
            PathEncoding { path: BString },
            #[error("Non-padding bytes found after all paths were read.")]
            UnknownTrailerBytes,
        }
    }

    /// Parse null-separated index names from the given `chunk` of bytes, expecting `num_packs` of them.
    /// Trailing padding bytes, which are typically `\0`, are ignored.
    pub fn from_bytes(mut chunk: &[u8], num_packs: u32) -> Result<Vec<PathBuf>, decode::Error> {
        let mut out = Vec::new();
        for _ in 0..num_packs {
            let null_byte_pos = chunk.find_byte(b'\0').ok_or(decode::Error::MissingNullByte)?;

            let path = &chunk[..null_byte_pos];
            let path = git_path::try_from_byte_slice(path)
                .map_err(|_| decode::Error::PathEncoding {
                    path: BString::from(path),
                })?
                .to_owned();

            if let Some(previous) = out.last() {
                if previous >= &path {
                    return Err(decode::Error::NotOrderedAlphabetically);
                }
            }
            out.push(path);

            chunk = &chunk[null_byte_pos + 1..];
        }

        // NOTE: git writes garbage into this chunk, usually extra \0 bytes, which we simply ignore. If we were strict
        // about it we couldn't read this chunk data at all.
        if !chunk.is_empty() && !chunk.iter().all(|b| *b == 0) {
            return Err(decode::Error::UnknownTrailerBytes);
        }
        Ok(out)
    }

    /// Calculate the size on disk for our chunk with the given index paths. Note that these are expected to have
    /// already been processed into plain file names.
    pub fn storage_size(paths: impl IntoIterator<Item = impl AsRef<Path>>) -> u64 {
        let mut count = 0u64;
        for path in paths {
            let path = path.as_ref();
            let ascii_path = path.to_str().expect("UTF-8 compatible paths");
            assert!(
                ascii_path.is_ascii(),
                "must use ascii bytes for correct size computation"
            );
            count += (ascii_path.as_bytes().len() + 1 /* null byte */) as u64;
        }

        let needed_alignment = CHUNK_ALIGNMENT - (count % CHUNK_ALIGNMENT);
        if needed_alignment < CHUNK_ALIGNMENT {
            count += needed_alignment;
        }
        count
    }

    /// Write all `paths` in order to `out`, including padding.
    pub fn write(
        paths: impl IntoIterator<Item = impl AsRef<Path>>,
        mut out: impl std::io::Write,
    ) -> std::io::Result<()> {
        let mut written_bytes = 0;
        for path in paths {
            let path = path.as_ref().to_str().expect("UTF-8 path");
            out.write_all(path.as_bytes())?;
            out.write_all(&[0])?;
            written_bytes += path.as_bytes().len() as u64 + 1;
        }

        let needed_alignment = CHUNK_ALIGNMENT - (written_bytes % CHUNK_ALIGNMENT);
        if needed_alignment < CHUNK_ALIGNMENT {
            let padding = [0u8; CHUNK_ALIGNMENT as usize];
            out.write_all(&padding[..needed_alignment as usize])?;
        }
        Ok(())
    }

    const CHUNK_ALIGNMENT: u64 = 4;
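
    // A minimal sketch (not part of the original module) showing a round-trip through this
    // chunk: the index names below are hypothetical, and the test assumes `write`, `storage_size`
    // and `from_bytes` behave as documented above.
    #[cfg(test)]
    mod sketch {
        use std::path::PathBuf;

        #[test]
        fn write_then_parse_roundtrip() {
            let names = [PathBuf::from("pack-a.idx"), PathBuf::from("pack-b.idx")];
            let mut buf = Vec::new();
            super::write(names.iter(), &mut buf).expect("writing to a Vec never fails");
            // The written buffer is zero-padded to the chunk alignment of 4 bytes.
            assert_eq!(buf.len() as u64, super::storage_size(names.iter()));
            let parsed = super::from_bytes(&buf, names.len() as u32).expect("valid chunk");
            assert_eq!(parsed, names);
        }
    }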
}

/// Information for the chunk with the fanout table
pub mod fanout {
    use std::convert::TryInto;

    use crate::multi_index;

    /// The size of the fanout table.
    pub const SIZE: usize = 4 * 256;

    /// The id uniquely identifying the fanout table.
    pub const ID: git_chunk::Id = *b"OIDF";

    /// Decode the fanout table contained in `chunk`, or return `None` if it didn't have the expected size.
    pub fn from_bytes(chunk: &[u8]) -> Option<[u32; 256]> {
        if chunk.len() != SIZE {
            return None;
        }
        let mut out = [0; 256];
        for (c, f) in chunk.chunks(4).zip(out.iter_mut()) {
            *f = u32::from_be_bytes(c.try_into().unwrap());
        }
        out.into()
    }

    /// Write the fanout for the given entries, which must be sorted by oid.
    pub(crate) fn write(
        sorted_entries: &[multi_index::write::Entry],
        mut out: impl std::io::Write,
    ) -> std::io::Result<()> {
        let fanout = crate::index::write::encode::fanout(sorted_entries.iter().map(|e| e.id.first_byte()));

        for value in fanout.iter() {
            out.write_all(&value.to_be_bytes())?;
        }
        Ok(())
    }
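
    // A minimal sketch (not part of the original module): the fanout chunk is exactly 256
    // big-endian u32 values, so `from_bytes` rejects chunks of any other size. The all-zero
    // input used here is hypothetical.
    #[cfg(test)]
    mod sketch {
        #[test]
        fn from_bytes_requires_exact_size() {
            assert!(super::from_bytes(&[0u8; super::SIZE - 1]).is_none());
            let table = super::from_bytes(&[0u8; super::SIZE]).expect("exactly sized chunk decodes");
            assert!(table.iter().all(|count| *count == 0));
        }
    }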
}

/// Information about the oid lookup table.
pub mod lookup {
    use std::ops::Range;

    use crate::multi_index;

    /// The id uniquely identifying the oid lookup table.
    pub const ID: git_chunk::Id = *b"OIDL";

    /// Return the number of bytes needed to store the data on disk for the given number of `entries`.
    pub fn storage_size(entries: usize, object_hash: git_hash::Kind) -> u64 {
        (entries * object_hash.len_in_bytes()) as u64
    }

    pub(crate) fn write(
        sorted_entries: &[multi_index::write::Entry],
        mut out: impl std::io::Write,
    ) -> std::io::Result<()> {
        for entry in sorted_entries {
            out.write_all(entry.id.as_slice())?;
        }
        Ok(())
    }

    /// Return true if the size of the `offset` range matches what is expected for `num_objects` hashes of the given `hash` kind.
    pub fn is_valid(offset: &Range<usize>, hash: git_hash::Kind, num_objects: u32) -> bool {
        (offset.end - offset.start) / hash.len_in_bytes() == num_objects as usize
    }
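
    // A minimal sketch (not part of the original module): the lookup chunk is `num_objects`
    // object ids back to back, so its size is `num_objects * hash-length`. The numbers used
    // here are hypothetical.
    #[cfg(test)]
    mod sketch {
        #[test]
        fn storage_size_and_validation_agree() {
            // Two SHA-1 ids occupy 2 * 20 bytes.
            assert_eq!(super::storage_size(2, git_hash::Kind::Sha1), 40);
            assert!(super::is_valid(&(0..40), git_hash::Kind::Sha1, 2));
            assert!(!super::is_valid(&(0..39), git_hash::Kind::Sha1, 2));
        }
    }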
}

/// Information about the offsets table.
pub mod offsets {
    use std::{convert::TryInto, ops::Range};

    use crate::multi_index;

    /// The id uniquely identifying the offsets table.
    pub const ID: git_chunk::Id = *b"OOFF";

    /// Return the number of bytes needed to store the offset data for `entries`.
    pub fn storage_size(entries: usize) -> u64 {
        (entries * (4 /* pack-id */ + 4 /* pack-offset */)) as u64
    }

    pub(crate) fn write(
        sorted_entries: &[multi_index::write::Entry],
        large_offsets_needed: bool,
        mut out: impl std::io::Write,
    ) -> std::io::Result<()> {
        use crate::index::write::encode::{HIGH_BIT, LARGE_OFFSET_THRESHOLD};
        let mut num_large_offsets = 0u32;

        for entry in sorted_entries {
            out.write_all(&entry.pack_index.to_be_bytes())?;

            let offset: u32 = if large_offsets_needed {
                if entry.pack_offset > LARGE_OFFSET_THRESHOLD {
                    let res = num_large_offsets | HIGH_BIT;
                    num_large_offsets += 1;
                    res
                } else {
                    entry.pack_offset as u32
                }
            } else {
                entry
                    .pack_offset
                    .try_into()
                    .expect("without large offsets, pack-offset fits u32")
            };
            out.write_all(&offset.to_be_bytes())?;
        }
        Ok(())
    }

    /// Returns true if the `offset` range seems to match the size required for `num_objects`.
    pub fn is_valid(offset: &Range<usize>, num_objects: u32) -> bool {
        let entry_size = 4 /* pack-id */ + 4 /* pack-offset */;
        ((offset.end - offset.start) / num_objects as usize) == entry_size
    }
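
    // A minimal sketch (not part of the original module): each entry occupies 4 bytes of
    // pack-id plus 4 bytes of pack-offset, which is what both `storage_size` and `is_valid`
    // encode. The numbers used here are hypothetical.
    #[cfg(test)]
    mod sketch {
        #[test]
        fn storage_size_and_validation_agree() {
            // Three entries at 8 bytes each.
            assert_eq!(super::storage_size(3), 24);
            assert!(super::is_valid(&(0..24), 3));
            assert!(!super::is_valid(&(0..16), 3));
        }
    }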
}

/// Information about the large offsets table.
pub mod large_offsets {
    use std::ops::Range;

    use crate::{index::write::encode::LARGE_OFFSET_THRESHOLD, multi_index};

    /// The id uniquely identifying the large offsets table (with 64 bit offsets).
    pub const ID: git_chunk::Id = *b"LOFF";

    /// Returns `Some(num_large_offsets)` if any offset is too large to fit into a u32.
    pub(crate) fn num_large_offsets(entries: &[multi_index::write::Entry]) -> Option<usize> {
        let mut num_large_offsets = 0;
        let mut needs_large_offsets = false;
        for entry in entries {
            if entry.pack_offset > LARGE_OFFSET_THRESHOLD {
                num_large_offsets += 1;
            }
            if entry.pack_offset > u32::MAX as crate::data::Offset {
                needs_large_offsets = true;
            }
        }

        needs_large_offsets.then_some(num_large_offsets)
    }

    /// Returns true if the `offset` range seems to be properly aligned for the data we expect.
    pub fn is_valid(offset: &Range<usize>) -> bool {
        (offset.end - offset.start) % 8 == 0
    }

    pub(crate) fn write(
        sorted_entries: &[multi_index::write::Entry],
        mut num_large_offsets: usize,
        mut out: impl std::io::Write,
    ) -> std::io::Result<()> {
        for offset in sorted_entries
            .iter()
            .filter_map(|e| (e.pack_offset > LARGE_OFFSET_THRESHOLD).then_some(e.pack_offset))
        {
            out.write_all(&offset.to_be_bytes())?;
            num_large_offsets = num_large_offsets
                .checked_sub(1)
                .expect("BUG: wrote more offsets than previously found");
        }
        assert_eq!(num_large_offsets, 0, "BUG: wrote fewer offsets than initially counted");
        Ok(())
    }

    /// Return the number of bytes needed to store the given number of `large_offsets`.
    pub(crate) fn storage_size(large_offsets: usize) -> u64 {
        8 * large_offsets as u64
    }
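
    // A minimal sketch (not part of the original module): each large offset is stored as one
    // big-endian u64, so the chunk size must be a multiple of 8 bytes.
    #[cfg(test)]
    mod sketch {
        #[test]
        fn storage_size_matches_alignment_check() {
            // Two large offsets occupy 2 * 8 bytes.
            assert_eq!(super::storage_size(2), 16);
            assert!(super::is_valid(&(0..16)));
            assert!(!super::is_valid(&(0..15)));
        }
    }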
}