gix_commitgraph/file/verify.rs

1//! Auxiliary types used in commit graph file verification methods.
2use std::{
3    cmp::{max, min},
4    collections::HashMap,
5    path::Path,
6};
7
8use crate::{file, File, GENERATION_NUMBER_INFINITY, GENERATION_NUMBER_MAX};
9
/// The error used in [`File::traverse()`].
#[derive(thiserror::Error, Debug)]
#[allow(missing_docs)]
pub enum Error<E: std::error::Error + 'static> {
    /// Decoding a commit record from the file failed.
    #[error(transparent)]
    Commit(#[from] file::commit::Error),
    /// A commit record carried the null object ID, which is never a valid commit ID.
    #[error("commit at file position {pos} has invalid ID {id}")]
    CommitId {
        id: gix_hash::ObjectId,
        pos: file::Position,
    },
    /// Commit IDs in the file were not stored in strictly ascending order.
    #[error("commit at file position {pos} with ID {id} is out of order relative to its predecessor with ID {predecessor_id}")]
    CommitsOutOfOrder {
        id: gix_hash::ObjectId,
        pos: file::Position,
        predecessor_id: gix_hash::ObjectId,
    },
    /// The file's name looked like a split-chain graph file (`graph-{hash}.graph`) but the
    /// embedded hash did not match the file's trailing checksum; the payload is the expected filename.
    #[error("commit-graph filename should be {0}")]
    Filename(String),
    /// A commit's generation number exceeded `GENERATION_NUMBER_MAX`.
    #[error("commit {id} has invalid generation {generation}")]
    Generation { generation: u32, id: gix_hash::ObjectId },
    /// The checksum stored at the end of the file did not match the checksum recomputed over its content.
    #[error("checksum mismatch: expected {expected}, got {actual}")]
    Mismatch {
        actual: gix_hash::ObjectId,
        expected: gix_hash::ObjectId,
    },
    /// The user-provided `processor` callback returned an error, aborting the traversal.
    #[error("{0}")]
    Processor(#[source] E),
    /// A commit's root tree ID was the null ID.
    #[error("commit {id} has invalid root tree ID {root_tree_id}")]
    RootTreeId {
        id: gix_hash::ObjectId,
        root_tree_id: gix_hash::ObjectId,
    },
}
44
/// The positive result of [`File::traverse()`] providing some statistical information.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
pub struct Outcome {
    /// The largest encountered [`file::Commit`] generation number.
    pub max_generation: u32,
    /// The smallest encountered [`file::Commit`] generation number.
    ///
    /// For a file without any commits this is normalized to 0.
    pub min_generation: u32,
    /// The largest number of parents in a single [`file::Commit`].
    pub max_parents: u32,
    /// The total number of [`commits`][file::Commit]s seen in the iteration,
    /// as reported by the file header.
    pub num_commits: u32,
    /// A mapping of `N -> number of commits with N parents`.
    pub parent_counts: HashMap<u32, u32>,
}
60
61/// Verification
62impl File {
63    /// Returns the trailing checksum over the entire content of this file.
64    pub fn checksum(&self) -> &gix_hash::oid {
65        gix_hash::oid::from_bytes_unchecked(&self.data[self.data.len() - self.hash_len..])
66    }
67
68    /// Traverse all [commits][file::Commit] stored in this file and call `processor(commit) -> Result<(), Error>` on it.
69    ///
70    /// If the `processor` fails, the iteration will be stopped and the entire call results in the respective error.
71    pub fn traverse<'a, E, Processor>(&'a self, mut processor: Processor) -> Result<Outcome, Error<E>>
72    where
73        E: std::error::Error + 'static,
74        Processor: FnMut(&file::Commit<'a>) -> Result<(), E>,
75    {
76        self.verify_checksum()
77            .map_err(|(actual, expected)| Error::Mismatch { actual, expected })?;
78        verify_split_chain_filename_hash(&self.path, self.checksum()).map_err(Error::Filename)?;
79
80        let null_id = self.object_hash().null_ref();
81
82        let mut stats = Outcome {
83            max_generation: 0,
84            max_parents: 0,
85            min_generation: GENERATION_NUMBER_INFINITY,
86            num_commits: self.num_commits(),
87            parent_counts: HashMap::new(),
88        };
89
90        // TODO: Verify self.fan values as we go.
91        let mut prev_id: &gix_hash::oid = null_id;
92        for commit in self.iter_commits() {
93            if commit.id() <= prev_id {
94                if commit.id() == null_id {
95                    return Err(Error::CommitId {
96                        pos: commit.position(),
97                        id: commit.id().into(),
98                    });
99                }
100                return Err(Error::CommitsOutOfOrder {
101                    pos: commit.position(),
102                    id: commit.id().into(),
103                    predecessor_id: prev_id.into(),
104                });
105            }
106            if commit.root_tree_id() == null_id {
107                return Err(Error::RootTreeId {
108                    id: commit.id().into(),
109                    root_tree_id: commit.root_tree_id().into(),
110                });
111            }
112            if commit.generation() > GENERATION_NUMBER_MAX {
113                return Err(Error::Generation {
114                    generation: commit.generation(),
115                    id: commit.id().into(),
116                });
117            }
118
119            processor(&commit).map_err(Error::Processor)?;
120
121            stats.max_generation = max(stats.max_generation, commit.generation());
122            stats.min_generation = min(stats.min_generation, commit.generation());
123            let parent_count = commit
124                .iter_parents()
125                .try_fold(0u32, |acc, pos| pos.map(|_| acc + 1))
126                .map_err(Error::Commit)?;
127            *stats.parent_counts.entry(parent_count).or_insert(0) += 1;
128            prev_id = commit.id();
129        }
130
131        if stats.min_generation == GENERATION_NUMBER_INFINITY {
132            stats.min_generation = 0;
133        }
134
135        Ok(stats)
136    }
137
138    /// Assure the [`checksum`][File::checksum()] matches the actual checksum over all content of this file, excluding the trailing
139    /// checksum itself.
140    ///
141    /// Return the actual checksum on success or `(actual checksum, expected checksum)` if there is a mismatch.
142    pub fn verify_checksum(&self) -> Result<gix_hash::ObjectId, (gix_hash::ObjectId, gix_hash::ObjectId)> {
143        // Even though we could use gix_features::hash::bytes_of_file(…), this would require using our own
144        // Error type to support io::Error and Mismatch. As we only gain progress, there probably isn't much value
145        // as these files are usually small enough to process them in less than a second, even for the large ones.
146        // But it's possible, once a progress instance is passed.
147        let data_len_without_trailer = self.data.len() - self.hash_len;
148        let mut hasher = gix_features::hash::hasher(self.object_hash());
149        hasher.update(&self.data[..data_len_without_trailer]);
150        let actual = gix_hash::ObjectId::from_bytes_or_panic(hasher.digest().as_ref());
151
152        let expected = self.checksum();
153        if actual == expected {
154            Ok(actual)
155        } else {
156            Err((actual, expected.into()))
157        }
158    }
159}
160
161/// If the given path's filename matches "graph-{hash}.graph", check that `hash` matches the
162/// expected hash.
163fn verify_split_chain_filename_hash(path: &Path, expected: &gix_hash::oid) -> Result<(), String> {
164    path.file_name()
165        .and_then(std::ffi::OsStr::to_str)
166        .and_then(|filename| filename.strip_suffix(".graph"))
167        .and_then(|stem| stem.strip_prefix("graph-"))
168        .map_or(Ok(()), |hex| match gix_hash::ObjectId::from_hex(hex.as_bytes()) {
169            Ok(actual) if actual == expected => Ok(()),
170            _ => Err(format!("graph-{}.graph", expected.to_hex())),
171        })
172}