git-pack 0.31.0

Please use `gix-<thiscrate>` instead ('git' -> 'gix')
Documentation
use std::sync::atomic::AtomicBool;

use git_features::progress::Progress;
use git_object::{bstr::ByteSlice, WriteTo};

use crate::index;

///
pub mod integrity {
    use std::marker::PhantomData;

    use git_object::bstr::BString;

    /// Returned by [`index::File::verify_integrity()`][crate::index::File::verify_integrity()].
    #[derive(thiserror::Error, Debug)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error("The fan at index {index} is out of order as it's larger then the following value.")]
        Fan { index: usize },
        #[error("{kind} object {id} could not be decoded")]
        ObjectDecode {
            source: git_object::decode::Error,
            kind: git_object::Kind,
            id: git_hash::ObjectId,
        },
        #[error("{kind} object {id} wasn't re-encoded without change, wanted\n{expected}\n\nGOT\n\n{actual}")]
        ObjectEncodeMismatch {
            kind: git_object::Kind,
            id: git_hash::ObjectId,
            expected: BString,
            actual: BString,
        },
    }

    /// Returned by [`index::File::verify_integrity()`][crate::index::File::verify_integrity()].
    pub struct Outcome<P> {
        /// The computed checksum of the index which matched the stored one.
        pub actual_index_checksum: git_hash::ObjectId,
        /// The packs traversal outcome, if one was provided
        pub pack_traverse_statistics: Option<crate::index::traverse::Statistics>,
        /// The provided progress instance.
        pub progress: P,
    }

    /// Additional options to define how the integrity should be verified.
    #[derive(Clone)]
    pub struct Options<F> {
        /// The thoroughness of the verification
        pub verify_mode: crate::index::verify::Mode,
        /// The way to traverse packs
        pub traversal: crate::index::traverse::Algorithm,
        /// The amount of threads to use of `Some(N)`, with `None|Some(0)` using all available cores are used.
        pub thread_limit: Option<usize>,
        /// A function to create a pack cache
        pub make_pack_lookup_cache: F,
    }

    impl Default for Options<fn() -> crate::cache::Never> {
        fn default() -> Self {
            Options {
                verify_mode: Default::default(),
                traversal: Default::default(),
                thread_limit: None,
                make_pack_lookup_cache: || crate::cache::Never,
            }
        }
    }

    /// The progress ids used in [`index::File::verify_integrity()`][crate::index::File::verify_integrity()].
    ///
    /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization.
    #[derive(Debug, Copy, Clone)]
    pub enum ProgressId {
        /// The amount of bytes read to verify the index checksum.
        ChecksumBytes,
        /// A root progress for traversal which isn't actually used directly, but here to link to the respective `ProgressId` types.
        Traverse(PhantomData<crate::index::verify::index::traverse::ProgressId>),
    }

    impl From<ProgressId> for git_features::progress::Id {
        fn from(v: ProgressId) -> Self {
            match v {
                ProgressId::ChecksumBytes => *b"PTHI",
                ProgressId::Traverse(_) => git_features::progress::UNKNOWN,
            }
        }
    }
}

///
pub mod checksum {
    /// Returned by [`index::File::verify_checksum()`][crate::index::File::verify_checksum()].
    pub type Error = crate::verify::checksum::Error;
}

/// Various ways in which a pack and index can be verified
#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)]
pub enum Mode {
    /// Validate the object hash and CRC32
    HashCrc32,
    /// Validate hash and CRC32, and decode each non-Blob object.
    /// Each object should be valid, i.e. be decodable.
    HashCrc32Decode,
    /// Validate hash and CRC32, and decode and encode each non-Blob object.
    /// Each object should yield exactly the same hash when re-encoded.
    HashCrc32DecodeEncode,
}

impl Default for Mode {
    fn default() -> Self {
        Mode::HashCrc32DecodeEncode
    }
}

/// Information to allow verifying the integrity of an index with the help of its corresponding pack.
pub struct PackContext<'a, F> {
    /// The pack data file itself.
    pub data: &'a crate::data::File,
    /// The options further configuring the pack traversal and verification
    pub options: integrity::Options<F>,
}

/// Verify and validate the content of the index file
impl index::File {
    /// Returns the trailing hash stored at the end of this index file.
    ///
    /// It's a hash over all bytes of the index.
    pub fn index_checksum(&self) -> git_hash::ObjectId {
        git_hash::ObjectId::from(&self.data[self.data.len() - self.hash_len..])
    }

    /// Returns the hash of the pack data file that this index file corresponds to.
    ///
    /// It should [`crate::data::File::checksum()`] of the corresponding pack data file.
    pub fn pack_checksum(&self) -> git_hash::ObjectId {
        let from = self.data.len() - self.hash_len * 2;
        git_hash::ObjectId::from(&self.data[from..][..self.hash_len])
    }

    /// Validate that our [`index_checksum()`][index::File::index_checksum()] matches the actual contents
    /// of this index file, and return it if it does.
    pub fn verify_checksum(
        &self,
        progress: impl Progress,
        should_interrupt: &AtomicBool,
    ) -> Result<git_hash::ObjectId, checksum::Error> {
        crate::verify::checksum_on_disk_or_mmap(
            self.path(),
            &self.data,
            self.index_checksum(),
            self.object_hash,
            progress,
            should_interrupt,
        )
    }

    /// The most thorough validation of integrity of both index file and the corresponding pack data file, if provided.
    /// Returns the checksum of the index file, the traversal outcome and the given progress if the integrity check is successful.
    ///
    /// If `pack` is provided, it is expected (and validated to be) the pack belonging to this index.
    /// It will be used to validate internal integrity of the pack before checking each objects integrity
    /// is indeed as advertised via its SHA1 as stored in this index, as well as the CRC32 hash.
    /// The last member of the Option is a function returning an implementation of [`crate::cache::DecodeEntry`] to be used if
    /// the [`index::traverse::Algorithm`] is `Lookup`.
    /// To set this to `None`, use `None::<(_, _, _, fn() -> crate::cache::Never)>`.
    ///
    /// The `thread_limit` optionally specifies the amount of threads to be used for the [pack traversal][index::File::traverse()].
    /// `make_cache` is only used in case a `pack` is specified, use existing implementations in the [`crate::cache`] module.
    ///
    /// # Tradeoffs
    ///
    /// The given `progress` is inevitably consumed if there is an error, which is a tradeoff chosen to easily allow using `?` in the
    /// error case.
    pub fn verify_integrity<P, C, F>(
        &self,
        pack: Option<PackContext<'_, F>>,
        mut progress: P,
        should_interrupt: &AtomicBool,
    ) -> Result<integrity::Outcome<P>, index::traverse::Error<index::verify::integrity::Error>>
    where
        P: Progress,
        C: crate::cache::DecodeEntry,
        F: Fn() -> C + Send + Clone,
    {
        if let Some(first_invalid) = crate::verify::fan(&self.fan) {
            return Err(index::traverse::Error::Processor(integrity::Error::Fan {
                index: first_invalid,
            }));
        }

        match pack {
            Some(PackContext {
                data: pack,
                options:
                    integrity::Options {
                        verify_mode,
                        traversal,
                        thread_limit,
                        make_pack_lookup_cache,
                    },
            }) => self
                .traverse(
                    pack,
                    progress,
                    should_interrupt,
                    || {
                        let mut encode_buf = Vec::with_capacity(2048);
                        move |kind, data, index_entry, progress| {
                            Self::verify_entry(verify_mode, &mut encode_buf, kind, data, index_entry, progress)
                        }
                    },
                    index::traverse::Options {
                        traversal,
                        thread_limit,
                        check: index::traverse::SafetyCheck::All,
                        make_pack_lookup_cache,
                    },
                )
                .map(|o| integrity::Outcome {
                    actual_index_checksum: o.actual_index_checksum,
                    pack_traverse_statistics: Some(o.statistics),
                    progress: o.progress,
                }),
            None => self
                .verify_checksum(
                    progress.add_child_with_id("Sha1 of index", integrity::ProgressId::ChecksumBytes.into()),
                    should_interrupt,
                )
                .map_err(Into::into)
                .map(|id| integrity::Outcome {
                    actual_index_checksum: id,
                    pack_traverse_statistics: None,
                    progress,
                }),
        }
    }

    #[allow(clippy::too_many_arguments)]
    fn verify_entry<P>(
        verify_mode: Mode,
        encode_buf: &mut Vec<u8>,
        object_kind: git_object::Kind,
        buf: &[u8],
        index_entry: &index::Entry,
        progress: &mut P,
    ) -> Result<(), integrity::Error>
    where
        P: Progress,
    {
        if let Mode::HashCrc32Decode | Mode::HashCrc32DecodeEncode = verify_mode {
            use git_object::Kind::*;
            match object_kind {
                Tree | Commit | Tag => {
                    let object = git_object::ObjectRef::from_bytes(object_kind, buf).map_err(|err| {
                        integrity::Error::ObjectDecode {
                            source: err,
                            kind: object_kind,
                            id: index_entry.oid,
                        }
                    })?;
                    if let Mode::HashCrc32DecodeEncode = verify_mode {
                        encode_buf.clear();
                        object
                            .write_to(&mut *encode_buf)
                            .expect("writing to a memory buffer never fails");
                        if encode_buf.as_slice() != buf {
                            let mut should_return_error = true;
                            if let git_object::Kind::Tree = object_kind {
                                if buf.as_bstr().find(b"100664").is_some() || buf.as_bstr().find(b"100640").is_some() {
                                    progress.info(format!("Tree object {} would be cleaned up during re-serialization, replacing mode '100664|100640' with '100644'", index_entry.oid));
                                    should_return_error = false
                                }
                            }
                            if should_return_error {
                                return Err(integrity::Error::ObjectEncodeMismatch {
                                    kind: object_kind,
                                    id: index_entry.oid,
                                    expected: buf.into(),
                                    actual: encode_buf.clone().into(),
                                });
                            }
                        }
                    }
                }
                Blob => {}
            };
        }
        Ok(())
    }
}