// git-bug 0.2.4 - Docs.rs

// git-bug-rs - A rust library for interfacing with git-bug repositories
//
// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de>
// SPDX-License-Identifier: GPL-3.0-or-later
//
// This file is part of git-bug-rs/git-gub.
//
// You should have received a copy of the License along with this program.
// If not, see <https://www.gnu.org/licenses/agpl.txt>.

//! The core trait of git-bug-rs. All values that we commit to storage are
//! [`Entities`][`Entity`], that is, they implement the [`Entity`] trait.
//!
//! This trait implements reading (and in the future writing) the data structure
//! expected by `git-bug` in the commits, whilst delegating the encoding/decoding
//! of operations to the implementer.

use std::{
    cmp::Ordering,
    collections::{HashMap, HashSet},
    fmt::Debug,
    fs,
    time::{Duration, SystemTime},
};

use gix::{ObjectId, Repository};
use serde::{Serialize, de::DeserializeOwned};

use self::{
    id::entity_id::EntityId,
    lamport::Clock,
    operation::{
        operation_data::OperationData, operation_pack::OperationPack, operations::Operations,
    },
    snapshot::{
        Snapshot,
        timeline::{Timeline, history_step::HistoryStep},
    },
};
use super::{Replica, cache::impl_cache};

pub mod id;
pub mod identity;
pub mod lamport;
pub mod operation;
pub mod snapshot;

pub mod nonce;
pub mod timestamp;

/// An [`Entity`] is one of the “objects” that a
/// [`Replica`][`super::Replica`] is composed of.
///
/// The trait bundles the functionality that every [`Entity`] has in common.
pub trait Entity: Sized + Debug + DeserializeOwned + Serialize {
    /// Human-readable name of this kind of entity (issue, pull-request,
    /// ...).
    const TYPENAME: &str;

    /// Namespace of this kind of entity in git references (bugs, prs, ...).
    const NAMESPACE: &str;

    /// Version number of the expected data format. It can be used for data
    /// migration/upgrade.
    const FORMAT_VERSION: usize;

    /// The kind of operation payload this [`Entity`] is built from.
    type OperationData: Debug + OperationData + Clone + Serialize + DeserializeOwned;

    /// One step in the history of a [`Snapshot's`][`snapshot::Snapshot`]
    /// [`Timeline`][`snapshot::timeline::Timeline`].
    type HistoryStep: Debug + HistoryStep;

    /// The full timeline of a [`Snapshot`] of this [`Entity`].
    type Timeline: Debug + Timeline<Self>;

    /// The [`EntityId`] of this [`Entity`], which is the id of its root
    /// operation.
    fn id(&self) -> EntityId<Self>
    where
        Self: Sized,
    {
        let root = self.operations().root();
        root.id()
    }

    /// Build a snapshot of this [`Entity`], i.e., a frozen view of all of
    /// this [`Entity's`][`Entity`] [`Operations`][`operation::Operation`].
    ///
    /// # Note
    /// Both the identity and the issue [`Entities`][`Entity`] extend the
    /// generic [`Snapshot`][`snapshot::Snapshot`] structure with getters for
    /// their specific values.
    #[must_use]
    fn snapshot(&self) -> Snapshot<Self> {
        let operations = self.operations();
        let mut snapshot = Snapshot::<Self>::from_root_operation(operations.root());

        // The snapshot was created from the root operation, so only the
        // operations following it still need to be applied.
        operations.iter().skip(1).for_each(|operation| {
            snapshot.apply(operation);
        });

        snapshot
    }

    /// The [`Operations`] this [`Entity`] is composed of.
    fn operations(&self) -> &Operations<Self>
    where
        Self: Sized;

    /// The [`lamport::Time`] that was set when this [`Entity`] was first
    /// created.
    fn create_time(&self) -> &lamport::Time
    where
        Self: Sized;

    /// The [`lamport::Time`] that was set when this [`Entity`] was last
    /// edited.
    fn edit_time(&self) -> &lamport::Time
    where
        Self: Sized;

    /// The [commit object id][`ObjectId`] of the last commit that added
    /// operations to this [`Entity`].
    fn current_head(&self) -> &gix::oid
    where
        Self: Sized;

    /// Assemble an instance from the data that is stored on disk.
    ///
    /// This function has no way to return an error: invalid values are
    /// expected to have been rejected earlier by the decoding functions
    /// (e.g., [`Operation::from_value`][`operation::Operation::from_value`]).
    ///
    /// # Note
    /// You probably do not want to call this function. It is only useful if
    /// you want to create this [`Entity`] from its on-disk serialization (or
    /// from a cache).
    ///
    /// # Safety
    /// The caller has to uphold the following invariants:
    /// - `create_time` and `edit_time` must be valid for the repository at this time (i.e., they
    ///   should have been computed through witnessing the other clocks).
    /// - `current_head` must be the actual current head (i.e., `operations` must be computable by
    ///   starting a commit traversal from this point).
    unsafe fn from_parts(
        operations: Operations<Self>,
        create_time: lamport::Time,
        edit_time: lamport::Time,
        current_head: ObjectId,
    ) -> Self
    where
        Self: Sized;
}

/// Errors that can occur while resolving the git reference of an entity (see
/// `EntityRead::last_git_commit`, which returns this module's `Error`).
#[allow(missing_docs)]
pub mod find {
    use crate::replica;

    /// The error returned when the head commit of an entity cannot be looked
    /// up.
    #[derive(Debug, thiserror::Error)]
    pub enum Error {
        /// The reference could not be found in the repository.
        #[error("Expected to find a reference at {reference}, but found none, because {error}")]
        MissingReference {
            /// The path of the reference that was looked up.
            reference: String,
            /// The underlying `gix` lookup error.
            error: gix::reference::find::existing::Error,
        },

        /// An error of the replica's cache, forwarded unchanged.
        // NOTE(review): presumably produced by the `impl_cache!` invocations in
        // `last_git_commit` — confirm against the macro definition.
        #[error(transparent)]
        CacheError(#[from] replica::cache::Error),
    }
}
/// Errors that can occur while walking the commit graph of an entity (see
/// `EntityRead::breadth_first_search`, which returns this module's `Error`).
#[allow(missing_docs)]
pub mod bfs {
    /// The error returned when the commit traversal fails.
    #[derive(Debug, thiserror::Error)]
    pub enum Error {
        /// A git object that is referenced by the traversal could not be
        /// found in the repository.
        #[error("Expected to find a git object with {id}, but found none, because {error}")]
        MissingObjectId {
            /// The id of the object that was looked up.
            id: gix::ObjectId,
            /// The underlying `gix` lookup error.
            error: gix::object::find::existing::Error,
        },
    }
}

/// Extension trait for [`Entities`][`Entity`].
///
/// These functions are responsible for reading and writing the [`Entity`] to
/// the git repository.
pub trait EntityRead: Entity {
    /// An error that can be used to extend the default [`read::Error`]
    /// returned by [`read`][`EntityRead::read`].
    ///
    /// Defining this is only really useful if you override the default
    /// [`read`][`EntityRead::read`] method. Otherwise, you can set this to
    /// [`std::convert::Infallible`] (we would do this here, but
    /// default associated types are not stable yet).
    type CustomReadError: Debug + std::error::Error;

    /// Get the commit associated with the last entry of this
    /// [`Entity's`][`Entity`] [`Operations`][`operation::Operation`].
    ///
    /// Conceptually, this just resolves the git reference at
    /// `refs/Self::NAMESPACE/id`.
    ///
    /// # Errors
    /// If the reference could not be resolved (e.g., it was missing.)
    fn last_git_commit(replica: &Replica, id: EntityId<Self>) -> Result<ObjectId, find::Error> {
        // TODO(@bpeetz): This function should be fast, once git switches to their reftable impl by
        // default. But until then, we need to cache the lookups, because finding a ref can (e.g.,
        // if packed) end up with a linear search. <2025-05-27>

        let reference_path = id.to_ref_path();
        let newest_time = {
            let packed_refs_time = fs::metadata(replica.repo().refs.packed_refs_path())
                .map(|metadata| metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH))
                .unwrap_or(SystemTime::UNIX_EPOCH);

            let refs_time = fs::metadata(replica.repo().refs.git_dir().join(&reference_path))
                .map(|metadata| metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH))
                .unwrap_or(SystemTime::UNIX_EPOCH);

            let max = std::cmp::max(refs_time, packed_refs_time);

            max.duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or(Duration::from_secs(0))
        };

        let key = {
            let mut base = [0; 64];

            #[allow(clippy::needless_as_bytes)]
            {
                assert!(
                    newest_time.as_secs().to_be_bytes().len()
                        + Self::NAMESPACE.as_bytes().len()
                        + id.as_id().as_slice().len()
                        <= 64
                );
            }

            for (byte, slot) in newest_time
                .as_secs()
                .to_be_bytes()
                .iter()
                .chain(Self::NAMESPACE.as_bytes())
                .chain(id.as_id().as_slice())
                .zip(base.iter_mut())
            {
                *slot = *byte;
            }

            base
        };

        impl_cache!(@mk_table "reftable");

        impl_cache! {@lookup replica.db(), &key[..]};

        let me = replica
            .repo()
            .find_reference(&reference_path)
            .map_err(|err| find::Error::MissingReference {
                reference: reference_path,
                error: err,
            })?
            .target()
            .id()
            .to_owned();

        impl_cache! {@populate replica.db(), &key[..], &me};

        Ok(me)
    }

    /// A breadth-first search to get a topological order of the Operations DAG
    /// where we discover the parents commit and go back in time up to the
    /// chronological root.
    ///
    /// The returned commits start with the commit at `root_id` (the head the
    /// traversal starts from) and are ordered by their distance from it. Every
    /// commit is contained exactly once, even if it is reachable via multiple
    /// paths.
    ///
    /// # Errors
    /// If one of the git Ids has no object attached to it.
    ///
    /// # Panics
    /// If one of the visited objects is not a commit (this is what
    /// `into_commit` does on a non-commit object).
    fn breadth_first_search(
        repo: &Repository,
        root_id: ObjectId,
    ) -> Result<Vec<gix::Commit<'_>>, bfs::Error> {
        // This must be a FIFO queue: taking the ids from the back (as a plain
        // `Vec::pop` does) turns the traversal into a depth-first search, which
        // can visit a shared ancestor before all of its descendants were
        // visited.
        let mut queue: std::collections::VecDeque<ObjectId> =
            std::collections::VecDeque::with_capacity(32);
        let mut visited: HashSet<ObjectId> = HashSet::new();
        let mut bfs_order: Vec<gix::Commit<'_>> = Vec::with_capacity(32);

        visited.insert(root_id);
        queue.push_back(root_id);

        while let Some(current_id) = queue.pop_front() {
            let commit = repo
                .find_object(current_id)
                .map_err(|err| bfs::Error::MissingObjectId {
                    id: current_id,
                    error: err,
                })?
                .into_commit();

            for parent in commit.parent_ids() {
                let parent_id = parent.detach();

                // `insert` returns `true` only for ids we see for the first
                // time, so every commit is enqueued at most once.
                if visited.insert(parent_id) {
                    queue.push_back(parent_id);
                }
            }

            bfs_order.push(commit);
        }

        Ok(bfs_order)
    }

    /// Fetch this [`Entity`] from a git repository and decode it.
    ///
    /// The steps are:
    /// 1. Resolve the head commit of the entity and consult the cache.
    /// 2. Walk the commit DAG, decode the operation pack of every commit and validate the DAG
    ///    (single root, no operations in merge commits, lamport clocks that respect the
    ///    topology).
    /// 3. Witness the clocks of all operation packs in the repository's clocks.
    /// 4. Order the operation packs by their edit time (ties are broken by the stringified
    ///    pack id) and flatten them into the final list of operations.
    ///
    /// # Errors
    /// If the associated git operations fail, or if the stored data violates
    /// one of the validated invariants (see [`read::Error`]).
    ///
    /// # Panics
    /// - If the edit lamport time of a non-merge commit is more than `1_000_000` ahead of one of
    ///   its parents.
    /// - If the same commit is encountered twice while reading the operation packs.
    /// - If the id of the decoded entity differs from the requested `id`.
    // TODO(@bpeetz): Split this function up <2025-04-16>
    #[allow(clippy::too_many_lines)]
    fn read(replica: &Replica, id: EntityId<Self>) -> Result<Self, read::Error<Self>>
    where
        Self: Sized,
    {
        impl_cache!(@mk_table "entities");

        let last_id = Self::last_git_commit(replica, id)?;

        // The cache key is the entity id followed by the id of its head
        // commit, so a new head never hits a stale cache entry.
        let mut key = id.as_id().as_slice().to_owned();
        key.extend(last_id.as_slice());
        impl_cache! {@lookup replica.db, key.as_slice()}

        let (operation_packs, operation_count) = {
            // Now, we can reverse this topological order and read the commits in an order
            // where we are sure to have read all the chronological ancestors
            // when we read a commit.

            // Next step is to:
            // 1) read the operationPacks
            // 2) make sure that clocks causality respect the DAG topology.
            let bfs_order = Self::breadth_first_search(replica.repo(), last_id)?;

            let mut operation_packs: HashMap<ObjectId, OperationPack<Self>> = HashMap::new();
            let mut operation_count = 0;

            let mut is_first_commit = true;
            for commit in bfs_order.into_iter().rev() {
                let parent_count = commit.parent_ids().count();
                let is_merge = parent_count > 1;

                // Verify DAG structure has a single chronological root, so only the root
                // can have no parents. Said otherwise, the DAG needs to have exactly
                // one leaf.
                if !is_first_commit && parent_count == 0 {
                    return Err(read::Error::MultipleLeafs);
                }

                let operation_pack =
                    OperationPack::<Self>::from_repository(&replica.db, &commit)
                        .map_err(|err| read::Error::MalformedOperationPack { error: err })?;

                if is_merge && !operation_pack.operations.is_empty() {
                    return Err(read::Error::MergeWithOperations);
                }

                if is_first_commit && operation_pack.create_time.is_none() {
                    return Err(read::Error::CreationLamportTimeUnset);
                }

                // Make sure that the lamport clocks causality match the DAG topology
                for parent_id in commit.parent_ids() {
                    // All of the parents should already be in the map, as we iter through the
                    // commits in reversed order and thus started with the root commit, without
                    // parents.
                    let Some(parent_operation_pack) = operation_packs.get(&parent_id.detach())
                    else {
                        return Err(read::Error::RootWithParent);
                    };

                    if parent_operation_pack.edit_time >= operation_pack.edit_time {
                        return Err(read::Error::ParentWithGreaterEditTime {
                            parent_time: parent_operation_pack.edit_time,
                            child_time: operation_pack.edit_time,
                        });
                    }

                    // To avoid an attack where clocks are pushed toward the u64 overflow point, we
                    // make sure that the clocks don't jump too far in the
                    // future. We ignore merge commits here to allow merging
                    // after a long time without breaking anything, as long as
                    // there is one valid chain of small hops, it's fine.
                    //
                    // The subtraction cannot underflow, as the check above
                    // already rejected parents with a greater or equal edit
                    // time.
                    if !is_merge
                        && (operation_pack.edit_time.value()
                            - parent_operation_pack.edit_time.value())
                            > 1_000_000
                    {
                        panic!(
                            "Noticed lamport clock jumping too far in the future, this is likely \
                             an attack."
                        )
                    }
                }

                // Count every pack exactly once. Doing this inside of the
                // parent loop would skip the root commit (it has no parents)
                // and count commits once per parent.
                operation_count += operation_pack.operations.len();

                assert!(operation_packs.insert(commit.id, operation_pack).is_none());
                is_first_commit = false;
            }

            (operation_packs, operation_count)
        };

        {
            // Now that we checked, that the clocks are fine, we can update this repo's
            // clocks.

            // TODO(@bpeetz): Why are we updating the repo's clocks if they are updated
            // either way on the next write? <2025-04-16>
            let create_clock_name = format!("{}-create", Self::NAMESPACE);
            let edit_clock_name = format!("{}-edit", Self::NAMESPACE);

            for operation_pack in operation_packs.values() {
                lamport::repository::get_or_init_clock(replica.repo(), create_clock_name.as_str())?
                    .witness(operation_pack.create_time.unwrap_or(lamport::Time::from(0)))?;

                lamport::repository::get_or_init_clock(replica.repo(), edit_clock_name.as_str())?
                    .witness(operation_pack.edit_time)?;
            }
        }

        let sorted_operation_packs = {
            // Now that we know that the topological order and clocks are fine, we order the
            // operation packs based on the logical clocks, entirely ignoring
            // the DAG topology.

            // Although we checked that every parent has a lower edit time than its
            // children, we still need to sort the DAG, because multiple
            // children were not compared with each other.
            // Example:
            //      A
            //  /   |   \
            //  B   C   D
            //  \   |   /
            //      E
            // In this case, A < B ⋀ A < C ⋀ A < D is true, but we need a linear sorting
            // instead of this tree.

            let mut operation_packs: Vec<_> = operation_packs.into_values().collect();
            operation_packs.sort_unstable_by(|a, b| {
                let cmp = a.edit_time.cmp(&b.edit_time);

                // > We need to check for equal edit times, which would mean that we had
                // > concurrent edition over different machines, and we can't tell
                // > which one came first.
                //
                // > So, what now? We still need a total ordering and the most stable possible.
                // > As a secondary ordering, we can order based on a hash of
                // > the serialized Operations in the operation pack.
                // > It doesn't carry much meaning but it's unbiased and hard to abuse.
                // > This is a lexicographic ordering on the stringified ID.
                //
                // Git-bug stores its IDs as literal strings, as such we unfortunately
                // cannot use our IDs directly and need to first encode them as hex string, so
                // that we use the same sorting order as git-bug.

                if cmp == Ordering::Equal {
                    return a.id().to_string().cmp(&b.id().to_string());
                }

                cmp
            });

            operation_packs
        };

        let (operations, create_time, edit_time) = {
            // Now we can unpack the operation packs.

            let mut operations = Vec::with_capacity(operation_count);
            let mut create_time = lamport::Time::from(1);
            let mut edit_time = lamport::Time::from(1);

            for pack in sorted_operation_packs {
                operations.extend(pack.operations);

                // Keep the greatest create and edit time of all packs.
                if let Some(pack_create_time) = pack.create_time {
                    if pack_create_time > create_time {
                        create_time = pack_create_time;
                    }
                }
                if pack.edit_time > edit_time {
                    edit_time = pack.edit_time;
                }
            }

            (
                Operations::from_operations(operations)?,
                create_time,
                edit_time,
            )
        };

        // Safety:
        // - We check everything that git-bug also checks, so this should be fine?
        let me = unsafe { Self::from_parts(operations, create_time, edit_time, last_id) };

        // Check the id before the entity is handed to the cache, so that a
        // mismatching entity is never stored.
        assert_eq!(me.id(), id, "We should be able to find the correct id");

        impl_cache! {@populate replica.db, key.as_slice(), &me}

        Ok(me)
    }
}

/// [`Entity`] read errors.
pub mod read {
    use super::{
        Entity, EntityRead, lamport,
        operation::{operation_pack, operations},
    };
    use crate::replica;

    #[allow(missing_docs)]
    #[derive(Debug, thiserror::Error)]
    /// The error returned by [`EntityRead::read`].
    pub enum Error<E: Entity + EntityRead> {
        /// The git reference of the entity could not be resolved to its head
        /// commit.
        #[error(transparent)]
        ReferenceResolve(#[from] super::find::Error),
        /// Walking the commit graph of the entity failed.
        #[error(transparent)]
        BreadthFirstSearch(#[from] super::bfs::Error),

        /// The decoded operations could not be turned into a valid operation
        /// sequence.
        #[error("The operations composing this entity are invalid, because {0}")]
        WrongOperationsSequence(#[from] operations::create::Error<E>),

        // TODO(@bpeetz): Remove all these validation errors, with a type system that makes it
        // impossible to parse them (i.e, “Parse don't validate”). <2025-04-15>
        /// A commit with more than one parent carried operations.
        #[error("Found a merge commit with operations in it.")]
        MergeWithOperations,
        /// A commit other than the first one read had no parents.
        #[error("Multiple leafs in the entity DAG")]
        MultipleLeafs,
        /// The operation pack of the first commit read had no create time.
        #[error("The root commit missed it's 'create-clock-' tree entry")]
        CreationLamportTimeUnset,

        /// A parent of a commit was not among the already read commits.
        #[error("The root commit seems to have parents?")]
        RootWithParent,

        /// The operation pack of a commit could not be decoded.
        #[error("Expected to find a correct operation pack, but found none, because {error}")]
        MalformedOperationPack {
            /// The underlying decode error.
            error: operation_pack::decode::Error,
        },

        /// The edit lamport time of a parent commit was not strictly smaller
        /// than the one of its child.
        #[error("A parent commit had a greater or equal edit lamport time to it's child.'")]
        ParentWithGreaterEditTime {
            /// The edit time of the parent commit.
            parent_time: lamport::Time,
            /// The edit time of the child commit.
            child_time: lamport::Time,
        },

        /// Reading (or initializing) one of the repository's lamport clocks
        /// failed.
        #[error(transparent)]
        ClockRead(#[from] lamport::repository::get::Error),
        /// Persisting a witnessed lamport time failed.
        #[error(transparent)]
        ClockWitness(#[from] lamport::persistent::write::Error),

        /// An implementer-defined error. The default [`EntityRead::read`]
        /// does not construct this variant; it is available to implementers
        /// that override the method.
        #[error(transparent)]
        CustomRead(<E as EntityRead>::CustomReadError),

        /// An error of the replica's cache, forwarded unchanged.
        // NOTE(review): presumably produced by the `impl_cache!` invocations in
        // `EntityRead::read` — confirm against the macro definition.
        #[error(transparent)]
        CacheError(#[from] replica::cache::Error),
    }

    // TODO(@bpeetz): This doesn't work, because rust thinks that it is the same
    // as core's `impl<T> From<T> for T`
    // It is not completely different, but more specific.
    // As people will just have to implement this themselves for their specific
    // error, without generics. <2025-04-21>
    // ```rust
    // impl<E: Entity + EntityRead> From<<E as EntityRead>::CustomReadError> for Error<E> {
    //     fn from(value: <E as EntityRead>::CustomReadError) -> Self {
    //         Self::CustomRead(value)
    //     }
    // }
    // ```
}