1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
use crate::{pack, pack::index::access::PackOffset, pack::tree::Tree};
use git_features::{
    interrupt::is_triggered,
    progress::{self, Progress},
};
use std::{
    fs, io,
    io::{BufRead, Read},
    time::Instant,
};

/// Returned by [`Tree::from_offsets_in_pack()`]
#[derive(thiserror::Error, Debug)]
#[allow(missing_docs)]
pub enum Error {
    #[error("{message}")]
    Io { source: io::Error, message: &'static str },
    #[error(transparent)]
    Header(#[from] pack::data::header::decode::Error),
    #[error("Could find object with id {id} in this pack. Thin packs are not supported")]
    UnresolvedRefDelta { id: git_hash::ObjectId },
    #[error(transparent)]
    Tree(#[from] pack::tree::Error),
    #[error("Interrupted")]
    Interrupted,
}

// Size of the fixed header at the start of a pack data file; exactly this many bytes
// are read and validated via `pack::data::header::decode` before entries are parsed.
const PACK_HEADER_LEN: usize = 12;

/// Generate tree from certain input
impl<T> Tree<T> {
    /// Create a new `Tree` from any data sorted by offset, ascending as returned by the `data_sorted_by_offsets` iterator.
    /// * `get_pack_offset(item: &T`) -> PackOffset` is a function returning the pack offset of the given item, which can be used
    /// for obtaining the objects entry within the pack.
    /// * `pack_path` is the path to the pack file itself and from which to read the entry data, which is a pack file matching the offsets
    /// returned by `get_pack_offset(…)`.
    /// * `progress` is used to track progress when creating the tree.
    /// * `resolve_in_pack_id(git_hash::oid) -> Option<PackOffset>` takes an object ID and tries to resolve it to an object within this pack if
    /// possible. Failing to do so aborts the operation, and this function is not expected to be called in usual packs. It's a theoretical
    /// possibility though.
    ///
    /// Note that the sort order is ascending. The given pack file path must match the provided offsets.
    pub fn from_offsets_in_pack(
        data_sorted_by_offsets: impl Iterator<Item = T>,
        get_pack_offset: impl Fn(&T) -> PackOffset,
        pack_path: impl AsRef<std::path::Path>,
        mut progress: impl Progress,
        resolve_in_pack_id: impl Fn(&git_hash::oid) -> Option<PackOffset>,
    ) -> Result<Self, Error> {
        let mut r = io::BufReader::with_capacity(
            8192 * 8, // this value directly corresponds to performance, 8k (default) is about 4x slower than 64k
            fs::File::open(pack_path).map_err(|err| Error::Io {
                source: err,
                message: "open pack path",
            })?,
        );

        let anticpiated_num_objects = if let Some(num_objects) = data_sorted_by_offsets.size_hint().1 {
            progress.init(Some(num_objects), progress::count("objects"));
            num_objects
        } else {
            0
        };
        let mut tree = Tree::with_capacity(anticpiated_num_objects)?;

        {
            // safety check - assure ourselves it's a pack we can handle
            let mut buf = [0u8; PACK_HEADER_LEN];
            r.read_exact(&mut buf).map_err(|err| Error::Io {
                source: err,
                message: "reading header buffer with at least 12 bytes failed - pack file truncated?",
            })?;
            pack::data::header::decode(&buf)?;
        }

        let then = Instant::now();

        let mut previous_cursor_position = None::<u64>;

        for (idx, data) in data_sorted_by_offsets.enumerate() {
            let pack_offset = get_pack_offset(&data);
            if let Some(previous_offset) = previous_cursor_position {
                Self::advance_cursor_to_pack_offset(&mut r, pack_offset, previous_offset)?;
            };
            let entry = pack::data::Entry::from_read(&mut r, pack_offset).map_err(|err| Error::Io {
                source: err,
                message: "EOF while parsing header",
            })?;
            previous_cursor_position = Some(pack_offset + entry.header_size() as u64);

            use pack::data::entry::Header::*;
            match entry.header {
                Tree | Blob | Commit | Tag => {
                    tree.add_root(pack_offset, data)?;
                }
                RefDelta { base_id } => {
                    resolve_in_pack_id(base_id.as_ref())
                        .ok_or(Error::UnresolvedRefDelta { id: base_id })
                        .and_then(|base_pack_offset| {
                            tree.add_child(base_pack_offset, pack_offset, data).map_err(Into::into)
                        })?;
                }
                OfsDelta { base_distance } => {
                    let base_pack_offset = pack_offset
                        .checked_sub(base_distance)
                        .expect("in bound distance for deltas");
                    tree.add_child(base_pack_offset, pack_offset, data)?;
                }
            };
            progress.inc();
            if idx % 10_000 == 0 && is_triggered() {
                return Err(Error::Interrupted);
            }
        }

        progress.show_throughput(then);
        Ok(tree)
    }

    fn advance_cursor_to_pack_offset(
        r: &mut io::BufReader<fs::File>,
        pack_offset: u64,
        previous_offset: u64,
    ) -> Result<(), Error> {
        let mut bytes_to_skip = pack_offset
            .checked_sub(previous_offset)
            .expect("continuously ascending pack offets") as usize;
        while bytes_to_skip != 0 {
            let buf = r.fill_buf().map_err(|err| Error::Io {
                source: err,
                message: "skip bytes",
            })?;
            if buf.is_empty() {
                // This means we have reached the end of file and can't make progress anymore, before we have satisfied our need
                // for more
                return Err(Error::Io {
                    source: io::Error::new(
                        io::ErrorKind::UnexpectedEof,
                        "ran out of bytes before reading desired amount of bytes",
                    ),
                    message: "index file is damaged or corrupt",
                });
            }
            let bytes = buf.len().min(bytes_to_skip);
            r.consume(bytes);
            bytes_to_skip -= bytes;
        }
        Ok(())
    }
}