gix_commitgraph/file/
verify.rs

1//! Auxiliary types used in commit graph file verification methods.
2use std::{
3    cmp::{max, min},
4    collections::HashMap,
5    path::Path,
6};
7
8use crate::{file, File, GENERATION_NUMBER_INFINITY, GENERATION_NUMBER_MAX};
9
/// The error used in [`File::traverse()`].
#[derive(thiserror::Error, Debug)]
#[allow(missing_docs)]
pub enum Error<E: std::error::Error + 'static> {
    /// A commit could not be decoded from this file.
    #[error(transparent)]
    Commit(#[from] file::commit::Error),
    /// A commit's stored ID is invalid — in practice, the null hash.
    #[error("commit at file position {pos} has invalid ID {id}")]
    CommitId {
        id: gix_hash::ObjectId,
        pos: file::Position,
    },
    /// Commit IDs in the file are not in ascending order.
    #[error("commit at file position {pos} with ID {id} is out of order relative to its predecessor with ID {predecessor_id}")]
    CommitsOutOfOrder {
        id: gix_hash::ObjectId,
        pos: file::Position,
        predecessor_id: gix_hash::ObjectId,
    },
    /// The filename of a split-chain graph file does not match the hash stored in its trailer.
    #[error("commit-graph filename should be {0}")]
    Filename(String),
    /// A commit's generation number exceeds the representable maximum.
    #[error("commit {id} has invalid generation {generation}")]
    Generation { generation: u32, id: gix_hash::ObjectId },
    /// The file's trailing checksum could not be verified.
    #[error(transparent)]
    Checksum(#[from] checksum::Error),
    /// The user-provided `processor` callback returned an error, aborting the traversal.
    #[error("{0}")]
    Processor(#[source] E),
    /// A commit's root tree ID is invalid — in practice, the null hash.
    #[error("commit {id} has invalid root tree ID {root_tree_id}")]
    RootTreeId {
        id: gix_hash::ObjectId,
        root_tree_id: gix_hash::ObjectId,
    },
}
41
/// Error types produced when verifying the trailing checksum of a commit-graph file.
pub mod checksum {
    /// The error used in [`super::File::verify_checksum()`].
    #[derive(thiserror::Error, Debug)]
    #[allow(missing_docs)]
    pub enum Error {
        /// Hashing the file's content failed.
        #[error("failed to hash commit graph file")]
        Hasher(#[from] gix_hash::hasher::Error),
        /// The computed hash did not match the checksum stored in the file's trailer.
        #[error(transparent)]
        Verify(#[from] gix_hash::verify::Error),
    }
}
54
/// The positive result of [`File::traverse()`] providing some statistical information.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
pub struct Outcome {
    /// The largest encountered [`file::Commit`] generation number.
    pub max_generation: u32,
    /// The smallest encountered [`file::Commit`] generation number, or 0 if the file contains no commits.
    pub min_generation: u32,
    /// The largest number of parents in a single [`file::Commit`].
    pub max_parents: u32,
    /// The total number of [`commits`][file::Commit]s seen in the iteration.
    pub num_commits: u32,
    /// A mapping of `N -> number of commits with N parents`.
    pub parent_counts: HashMap<u32, u32>,
}
70
71/// Verification
72impl File {
73    /// Returns the trailing checksum over the entire content of this file.
74    pub fn checksum(&self) -> &gix_hash::oid {
75        gix_hash::oid::from_bytes_unchecked(&self.data[self.data.len() - self.hash_len..])
76    }
77
78    /// Traverse all [commits][file::Commit] stored in this file and call `processor(commit) -> Result<(), Error>` on it.
79    ///
80    /// If the `processor` fails, the iteration will be stopped and the entire call results in the respective error.
81    pub fn traverse<'a, E, Processor>(&'a self, mut processor: Processor) -> Result<Outcome, Error<E>>
82    where
83        E: std::error::Error + 'static,
84        Processor: FnMut(&file::Commit<'a>) -> Result<(), E>,
85    {
86        self.verify_checksum()?;
87        verify_split_chain_filename_hash(&self.path, self.checksum()).map_err(Error::Filename)?;
88
89        let null_id = self.object_hash().null_ref();
90
91        let mut stats = Outcome {
92            max_generation: 0,
93            max_parents: 0,
94            min_generation: GENERATION_NUMBER_INFINITY,
95            num_commits: self.num_commits(),
96            parent_counts: HashMap::new(),
97        };
98
99        // TODO: Verify self.fan values as we go.
100        let mut prev_id: &gix_hash::oid = null_id;
101        for commit in self.iter_commits() {
102            if commit.id() <= prev_id {
103                if commit.id() == null_id {
104                    return Err(Error::CommitId {
105                        pos: commit.position(),
106                        id: commit.id().into(),
107                    });
108                }
109                return Err(Error::CommitsOutOfOrder {
110                    pos: commit.position(),
111                    id: commit.id().into(),
112                    predecessor_id: prev_id.into(),
113                });
114            }
115            if commit.root_tree_id() == null_id {
116                return Err(Error::RootTreeId {
117                    id: commit.id().into(),
118                    root_tree_id: commit.root_tree_id().into(),
119                });
120            }
121            if commit.generation() > GENERATION_NUMBER_MAX {
122                return Err(Error::Generation {
123                    generation: commit.generation(),
124                    id: commit.id().into(),
125                });
126            }
127
128            processor(&commit).map_err(Error::Processor)?;
129
130            stats.max_generation = max(stats.max_generation, commit.generation());
131            stats.min_generation = min(stats.min_generation, commit.generation());
132            let parent_count = commit
133                .iter_parents()
134                .try_fold(0u32, |acc, pos| pos.map(|_| acc + 1))
135                .map_err(Error::Commit)?;
136            *stats.parent_counts.entry(parent_count).or_insert(0) += 1;
137            prev_id = commit.id();
138        }
139
140        if stats.min_generation == GENERATION_NUMBER_INFINITY {
141            stats.min_generation = 0;
142        }
143
144        Ok(stats)
145    }
146
147    /// Assure the [`checksum`][File::checksum()] matches the actual checksum over all content of this file, excluding the trailing
148    /// checksum itself.
149    ///
150    /// Return the actual checksum on success or [`checksum::Error`] if there is a mismatch.
151    pub fn verify_checksum(&self) -> Result<gix_hash::ObjectId, checksum::Error> {
152        // Even though we could use gix_hash::bytes_of_file(…), this would require extending our
153        // Error type to support io::Error. As we only gain progress, there probably isn't much value
154        // as these files are usually small enough to process them in less than a second, even for the large ones.
155        // But it's possible, once a progress instance is passed.
156        let data_len_without_trailer = self.data.len() - self.hash_len;
157        let mut hasher = gix_hash::hasher(self.object_hash());
158        hasher.update(&self.data[..data_len_without_trailer]);
159        let actual = hasher.try_finalize()?;
160        actual.verify(self.checksum())?;
161        Ok(actual)
162    }
163}
164
165/// If the given path's filename matches "graph-{hash}.graph", check that `hash` matches the
166/// expected hash.
167fn verify_split_chain_filename_hash(path: &Path, expected: &gix_hash::oid) -> Result<(), String> {
168    path.file_name()
169        .and_then(std::ffi::OsStr::to_str)
170        .and_then(|filename| filename.strip_suffix(".graph"))
171        .and_then(|stem| stem.strip_prefix("graph-"))
172        .map_or(Ok(()), |hex| match gix_hash::ObjectId::from_hex(hex.as_bytes()) {
173            Ok(actual) if actual == expected => Ok(()),
174            _ => Err(format!("graph-{}.graph", expected.to_hex())),
175        })
176}