git_commitgraph/file/
verify.rs

1//! Auxiliary types used in commit graph file verification methods.
2use std::{
3    cmp::{max, min},
4    collections::HashMap,
5    path::Path,
6};
7
8use crate::{
9    file::{self, File},
10    GENERATION_NUMBER_INFINITY, GENERATION_NUMBER_MAX,
11};
12
13/// The error used in [`File::traverse()`].
14#[derive(thiserror::Error, Debug)]
15#[allow(missing_docs)]
16pub enum Error<E: std::error::Error + 'static> {
17    #[error(transparent)]
18    Commit(#[from] file::commit::Error),
19    #[error("commit at file position {pos} has invalid ID {id}")]
20    CommitId {
21        id: git_hash::ObjectId,
22        pos: file::Position,
23    },
24    #[error("commit at file position {pos} with ID {id} is out of order relative to its predecessor with ID {predecessor_id}")]
25    CommitsOutOfOrder {
26        id: git_hash::ObjectId,
27        pos: file::Position,
28        predecessor_id: git_hash::ObjectId,
29    },
30    #[error("commit-graph filename should be {0}")]
31    Filename(String),
32    #[error("commit {id} has invalid generation {generation}")]
33    Generation { generation: u32, id: git_hash::ObjectId },
34    #[error("checksum mismatch: expected {expected}, got {actual}")]
35    Mismatch {
36        actual: git_hash::ObjectId,
37        expected: git_hash::ObjectId,
38    },
39    #[error("{0}")]
40    Processor(#[source] E),
41    #[error("commit {id} has invalid root tree ID {root_tree_id}")]
42    RootTreeId {
43        id: git_hash::ObjectId,
44        root_tree_id: git_hash::ObjectId,
45    },
46}
47
48/// The positive result of [`File::traverse()`] providing some statistical information.
49#[derive(Clone, Debug, Eq, PartialEq)]
50#[cfg_attr(feature = "serde1", derive(serde::Deserialize, serde::Serialize))]
51pub struct Outcome {
52    /// The largest encountered [`file::Commit`] generation number.
53    pub max_generation: u32,
54    /// The smallest encountered [`file::Commit`] generation number.
55    pub min_generation: u32,
56    /// The largest number of parents in a single [`file::Commit`].
57    pub max_parents: u32,
58    /// The total number of [`commits`][file::Commit]s seen in the iteration.
59    pub num_commits: u32,
60    /// A mapping of `N -> number of commits with N parents`.
61    pub parent_counts: HashMap<u32, u32>,
62}
63
64/// Verification
65impl File {
66    /// Returns the trailing checksum over the entire content of this file.
67    pub fn checksum(&self) -> &git_hash::oid {
68        git_hash::oid::from_bytes_unchecked(&self.data[self.data.len() - self.hash_len..])
69    }
70
71    /// Traverse all [commits][file::Commit] stored in this file and call `processor(commit) -> Result<(), Error>` on it.
72    ///
73    /// If the `processor` fails, the iteration will be stopped and the entire call results in the respective error.
74    pub fn traverse<'a, E, Processor>(&'a self, mut processor: Processor) -> Result<Outcome, Error<E>>
75    where
76        E: std::error::Error + 'static,
77        Processor: FnMut(&file::Commit<'a>) -> Result<(), E>,
78    {
79        self.verify_checksum()
80            .map_err(|(actual, expected)| Error::Mismatch { actual, expected })?;
81        verify_split_chain_filename_hash(&self.path, self.checksum()).map_err(Error::Filename)?;
82
83        let null_id = self.object_hash().null_ref();
84
85        let mut stats = Outcome {
86            max_generation: 0,
87            max_parents: 0,
88            min_generation: GENERATION_NUMBER_INFINITY,
89            num_commits: self.num_commits(),
90            parent_counts: HashMap::new(),
91        };
92
93        // TODO: Verify self.fan values as we go.
94        let mut prev_id: &git_hash::oid = null_id;
95        for commit in self.iter_commits() {
96            if commit.id() <= prev_id {
97                if commit.id() == null_id {
98                    return Err(Error::CommitId {
99                        pos: commit.position(),
100                        id: commit.id().into(),
101                    });
102                }
103                return Err(Error::CommitsOutOfOrder {
104                    pos: commit.position(),
105                    id: commit.id().into(),
106                    predecessor_id: prev_id.into(),
107                });
108            }
109            if commit.root_tree_id() == null_id {
110                return Err(Error::RootTreeId {
111                    id: commit.id().into(),
112                    root_tree_id: commit.root_tree_id().into(),
113                });
114            }
115            if commit.generation() > GENERATION_NUMBER_MAX {
116                return Err(Error::Generation {
117                    generation: commit.generation(),
118                    id: commit.id().into(),
119                });
120            }
121
122            processor(&commit).map_err(Error::Processor)?;
123
124            stats.max_generation = max(stats.max_generation, commit.generation());
125            stats.min_generation = min(stats.min_generation, commit.generation());
126            let parent_count = commit
127                .iter_parents()
128                .try_fold(0u32, |acc, pos| pos.map(|_| acc + 1))
129                .map_err(Error::Commit)?;
130            *stats.parent_counts.entry(parent_count).or_insert(0) += 1;
131            prev_id = commit.id();
132        }
133
134        if stats.min_generation == GENERATION_NUMBER_INFINITY {
135            stats.min_generation = 0;
136        }
137
138        Ok(stats)
139    }
140
141    /// Assure the [`checksum`][File::checksum()] matches the actual checksum over all content of this file, excluding the trailing
142    /// checksum itself.
143    ///
144    /// Return the actual checksum on success or `(actual checksum, expected checksum)` if there is a mismatch.
145    pub fn verify_checksum(&self) -> Result<git_hash::ObjectId, (git_hash::ObjectId, git_hash::ObjectId)> {
146        // Even though we could use git_features::hash::bytes_of_file(…), this would require using our own
147        // Error type to support io::Error and Mismatch. As we only gain progress, there probably isn't much value
148        // as these files are usually small enough to process them in less than a second, even for the large ones.
149        // But it's possible, once a progress instance is passed.
150        let data_len_without_trailer = self.data.len() - self.hash_len;
151        let mut hasher = git_features::hash::hasher(self.object_hash());
152        hasher.update(&self.data[..data_len_without_trailer]);
153        let actual = git_hash::ObjectId::from(hasher.digest().as_ref());
154
155        let expected = self.checksum();
156        if actual == expected {
157            Ok(actual)
158        } else {
159            Err((actual, expected.into()))
160        }
161    }
162}
163
164/// If the given path's filename matches "graph-{hash}.graph", check that `hash` matches the
165/// expected hash.
166fn verify_split_chain_filename_hash(path: impl AsRef<Path>, expected: &git_hash::oid) -> Result<(), String> {
167    let path = path.as_ref();
168    path.file_name()
169        .and_then(|filename| filename.to_str())
170        .and_then(|filename| filename.strip_suffix(".graph"))
171        .and_then(|stem| stem.strip_prefix("graph-"))
172        .map_or(Ok(()), |hex| match git_hash::ObjectId::from_hex(hex.as_bytes()) {
173            Ok(actual) if actual == expected => Ok(()),
174            _ => Err(format!("graph-{}.graph", expected.to_hex())),
175        })
176}