git_bug/replica/entity/mod.rs

// git-bug-rs - A rust library for interfacing with git-bug repositories
//
// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de>
// SPDX-License-Identifier: GPL-3.0-or-later
//
// This file is part of git-bug-rs/git-bug.
//
// You should have received a copy of the License along with this program.
// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>.

//! The core trait of git-bug-rs. All values that we commit to storage are
//! [`Entities`][`Entity`], that is, they implement the [`Entity`] trait.
//!
//! This trait implements reading (and in the future writing) the data structure
//! expected by `git-bug` in the commits, whilst delegating the encoding/decoding
//! of operations to the implementer.
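//!
//! A minimal usage sketch ([`EntityRead::read`] does the heavy lifting;
//! `Bug` is a hypothetical implementer of both [`Entity`] and [`EntityRead`]
//! and is not defined in this module):
//!
//! ```ignore
//! let bug: Bug = Bug::read(&replica, id)?;
//! let snapshot = bug.snapshot();
//! ```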

use std::{
    cmp::Ordering,
    collections::{HashMap, HashSet, VecDeque},
    fmt::Debug,
    fs,
    time::{Duration, SystemTime},
};

use gix::{ObjectId, Repository};
use serde::{Serialize, de::DeserializeOwned};

use self::{
    id::entity_id::EntityId,
    lamport::Clock,
    operation::{
        operation_data::OperationData, operation_pack::OperationPack, operations::Operations,
    },
    snapshot::{
        Snapshot,
        timeline::{Timeline, history_step::HistoryStep},
    },
};
use super::{Replica, cache::impl_cache};

pub mod id;
pub mod identity;
pub mod lamport;
pub mod operation;
pub mod snapshot;

pub mod nonce;
pub mod timestamp;

/// [`Entities`][`Entity`] are the “objects” that compose a
/// [`Replica`][`super::Replica`].
///
/// This trait provides the common functionality needed for each [`Entity`].
pub trait Entity: Sized + Debug + DeserializeOwned + Serialize {
    /// The name of the entity (issue, pull-request, ...), for human
    /// consumption.
    const TYPENAME: &str;

    /// The namespace in git references (bugs, prs, ...).
    const NAMESPACE: &str;

    /// The expected format version number, which can be used for data
    /// migration/upgrades.
    const FORMAT_VERSION: usize;

    /// The type of Operation this [`Entity`] uses.
    type OperationData: Debug + OperationData + Clone + Serialize + DeserializeOwned;

    /// A step in the history of a [`Snapshot's`][`snapshot::Snapshot`]
    /// [`Timeline`][`snapshot::timeline::Timeline`].
    type HistoryStep: Debug + HistoryStep;

    /// The complete timeline of a [`Snapshot`] of this [`Entity`].
    type Timeline: Debug + Timeline<Self>;

    /// Return this [`Entity`]'s [`EntityId`].
    fn id(&self) -> EntityId<Self>
    where
        Self: Sized,
    {
        self.operations().root().id()
    }

    /// Generate a snapshot of this [`Entity`]. This is a frozen collection of
    /// this [`Entity's`][`Entity`] [`Operations`][`operation::Operation`].
    ///
    /// # Note
    /// The generic [`Snapshot`][`snapshot::Snapshot`] structure is extended by
    /// both identity and issue [`Entities`][`Entity`] with getters for
    /// their specific values.
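    ///
    /// A minimal sketch (`entity` is any value implementing [`Entity`]; the
    /// getter shown is hypothetical):
    /// ```ignore
    /// let snapshot = entity.snapshot();
    /// // Concrete entities extend their snapshot with getters, e.g. an
    /// // issue-like entity might expose `snapshot.title()`.
    /// ```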
    #[must_use]
    fn snapshot(&self) -> Snapshot<Self> {
        let mut base = Snapshot::<Self>::from_root_operation(self.operations().root());

        // We skip the root operation, as we already (in effect) applied it
        // when creating `base`.
        for operation in self.operations().iter().skip(1) {
            base.apply(operation);
        }

        base
    }

    /// Return the [`Operations`] that compose this [`Entity`].
    fn operations(&self) -> &Operations<Self>
    where
        Self: Sized;

    /// Return the [`lamport::Time`] that was set when this [`Entity`] was
    /// first created.
    fn create_time(&self) -> &lamport::Time
    where
        Self: Sized;

    /// Return the [`lamport::Time`] that was set when this [`Entity`] was last
    /// edited.
    fn edit_time(&self) -> &lamport::Time
    where
        Self: Sized;

    /// Return the [commit object id][`ObjectId`] of the last commit that added
    /// operations to this [`Entity`].
    fn current_head(&self) -> &gix::oid
    where
        Self: Sized;

    /// Construct this instance from the data stored on disk.
    ///
    /// This function cannot return an error, because the previous decoding
    /// functions (e.g.,
    /// [`Operation::from_value`][`operation::Operation::from_value`]) should
    /// have already sorted out invalid values.
    ///
    /// # Note
    /// This function is probably not what you want. It is only useful if you
    /// want to create this [`Entity`] from its on-disk serialization (or
    /// from a cache).
    ///
    /// # Safety
    /// You need to uphold the following invariants:
    /// - The `creation_time` and `edit_time` must be valid for the repository at this time (i.e.,
    ///   they should have been computed through witnessing the other clocks).
    /// - The `current_head` must be the actual current head (i.e., `operations` must be computable
    ///   by starting a commit traversal from this point).
    unsafe fn from_parts(
        operations: Operations<Self>,
        create_time: lamport::Time,
        edit_time: lamport::Time,
        current_head: ObjectId,
    ) -> Self
    where
        Self: Sized;
}

#[allow(missing_docs)]
pub mod find {
    use crate::replica;

    #[derive(Debug, thiserror::Error)]
    pub enum Error {
        #[error("Expected to find a reference at {reference}, but found none, because {error}")]
        MissingReference {
            reference: String,
            error: gix::reference::find::existing::Error,
        },

        #[error(transparent)]
        CacheError(#[from] replica::cache::Error),
    }
}

#[allow(missing_docs)]
pub mod bfs {
    #[derive(Debug, thiserror::Error)]
    pub enum Error {
        #[error("Expected to find a git object with {id}, but found none, because {error}")]
        MissingObjectId {
            id: gix::ObjectId,
            error: gix::object::find::existing::Error,
        },
    }
}

/// Extension trait for [`Entities`][`Entity`].
///
/// These functions are responsible for reading and writing the [`Entity`] to
/// the git repository.
pub trait EntityRead: Entity {
    /// An error that can be used to extend the default error of [`read`].
    ///
    /// Defining this is only really useful if you override the default
    /// [`read`] method. Otherwise, you can set this to
    /// [`std::convert::Infallible`] (we would do this here, but
    /// default associated types are not stable yet).
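    ///
    /// For an entity that keeps the default [`read`], a minimal sketch:
    /// ```ignore
    /// type CustomReadError = std::convert::Infallible;
    /// ```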
    type CustomReadError: Debug + std::error::Error;

    /// Get the commit associated with the last entry of this
    /// [`Entity's`][`Entity`] [`Operations`][`operation::Operation`].
    ///
    /// Conceptually, this just resolves the git reference at
    /// `refs/Self::NAMESPACE/id`.
    ///
    /// # Errors
    /// If the reference could not be resolved (e.g., it was missing).
    fn last_git_commit(replica: &Replica, id: EntityId<Self>) -> Result<ObjectId, find::Error> {
        // TODO(@bpeetz): This function should be fast, once git switches to their reftable impl by
        // default. But until then, we need to cache the lookups, because finding a ref can (e.g.,
        // if packed) end up with a linear search. <2025-05-27>

        let reference_path = id.to_ref_path();
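
        // Use the newest mtime of the loose ref file and the packed-refs file
        // as a cache-invalidation token: if either changes on disk, the cached
        // lookup below is considered stale.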
        let newest_time = {
            let packed_refs_time = fs::metadata(replica.repo().refs.packed_refs_path())
                .map(|metadata| metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH))
                .unwrap_or(SystemTime::UNIX_EPOCH);

            let refs_time = fs::metadata(replica.repo().refs.git_dir().join(&reference_path))
                .map(|metadata| metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH))
                .unwrap_or(SystemTime::UNIX_EPOCH);

            let max = std::cmp::max(refs_time, packed_refs_time);

            max.duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or(Duration::from_secs(0))
        };

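        // Cache key layout, zero-padded to 64 bytes:
        //   [ newest_time secs as u64 BE | Self::NAMESPACE bytes | raw id bytes ]
        // Embedding the mtime means a changed ref automatically misses the old
        // cache entry instead of returning a stale commit id.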
        let key = {
            let mut base = [0; 64];

            #[allow(clippy::needless_as_bytes)]
            {
                assert!(
                    newest_time.as_secs().to_be_bytes().len()
                        + Self::NAMESPACE.as_bytes().len()
                        + id.as_id().as_slice().len()
                        <= 64
                );
            }

            for (byte, slot) in newest_time
                .as_secs()
                .to_be_bytes()
                .iter()
                .chain(Self::NAMESPACE.as_bytes())
                .chain(id.as_id().as_slice())
                .zip(base.iter_mut())
            {
                *slot = *byte;
            }

            base
        };

        impl_cache!(@mk_table "reftable");

        impl_cache! {@lookup replica.db(), &key[..]};

        let me = replica
            .repo()
            .find_reference(&reference_path)
            .map_err(|err| find::Error::MissingReference {
                reference: reference_path,
                error: err,
            })?
            .target()
            .id()
            .to_owned();

        impl_cache! {@populate replica.db(), &key[..], &me};

        Ok(me)
    }

    /// A breadth-first search to get a topological order of the Operations DAG,
    /// where we discover the parent commits and go back in time up to the
    /// chronological root.
    ///
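    /// For example, for a linear history `root <- A <- B <- head` (arrows
    /// point from child to parent), starting at `head` yields the order
    /// `[head, B, A, root]`.
    ///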
    /// # Errors
    /// If one of the git Ids has no commit object attached to it.
    fn breadth_first_search(
        repo: &Repository,
        root_id: ObjectId,
    ) -> Result<Vec<gix::Commit<'_>>, bfs::Error> {
        // NOTE: A `VecDeque` used as a FIFO queue is required here; popping
        // from the back of a `Vec` would turn this into a depth-first search,
        // whose reversed order is not guaranteed to be topological.
        let mut queue: VecDeque<ObjectId> = VecDeque::with_capacity(32);
        let mut visited: HashSet<ObjectId> = HashSet::new();
        let mut bfs_order: Vec<gix::Commit<'_>> = Vec::with_capacity(32);

        queue.push_back(root_id);

        while let Some(current_id) = queue.pop_front() {
            let commit = {
                let object =
                    repo.find_object(current_id)
                        .map_err(|err| bfs::Error::MissingObjectId {
                            id: current_id,
                            error: err,
                        })?;
                object.into_commit()
            };

            for parent in commit.parent_ids() {
                let parent = parent.detach();
                // `insert` returns `true` only if the parent was not yet visited.
                if visited.insert(parent) {
                    queue.push_back(parent);
                }
            }

            bfs_order.push(commit);
        }

        Ok(bfs_order)
    }

    /// Fetch this [`Entity`] from a git repository and decode it.
    ///
    /// # Errors
    /// If the associated git operations fail.
    // TODO(@bpeetz): Split this function up <2025-04-16>
    #[allow(clippy::too_many_lines)]
    fn read(replica: &Replica, id: EntityId<Self>) -> Result<Self, read::Error<Self>>
    where
        Self: Sized,
    {
        impl_cache!(@mk_table "entities");

        let last_id = Self::last_git_commit(replica, id)?;

        let mut key = id.as_id().as_slice().to_owned();
        key.extend(last_id.as_slice());
        impl_cache! {@lookup replica.db, key.as_slice()}

        let (operation_packs, operation_count) = {
            // We reverse the topological order below and read the commits in
            // an order where we are sure to have read all the chronological
            // ancestors when we read a commit.

            // The next steps are to:
            // 1) read the operation packs, and
            // 2) make sure that the clocks' causality respects the DAG topology.
            let bfs_order = Self::breadth_first_search(replica.repo(), last_id)?;

            let mut operation_packs: HashMap<ObjectId, OperationPack<Self>> = HashMap::new();
            let mut operation_count = 0;

            let mut is_first_commit = true;
            for commit in bfs_order.into_iter().rev() {
                let is_merge = commit.parent_ids().count() > 1;

                // Verify that the DAG has a single chronological root, so only
                // the root can have no parents. Said otherwise, the DAG needs
                // to have exactly one leaf.
                if !is_first_commit && commit.parent_ids().count() == 0 {
                    return Err(read::Error::MultipleLeafs);
                }

                let operation_pack =
                    OperationPack::<Self>::from_repository(&replica.db, &commit)
                        .map_err(|err| read::Error::MalformedOperationPack { error: err })?;

                if is_merge && !operation_pack.operations.is_empty() {
                    return Err(read::Error::MergeWithOperations);
                }

                if is_first_commit && operation_pack.create_time.is_none() {
                    return Err(read::Error::CreationLamportTimeUnset);
                }

                // Make sure that the lamport clocks' causality matches the DAG topology.
                for parent_id in commit.parent_ids() {
                    // All of the parents should already be in the map, as we
                    // iterate through the commits in reverse order and thus
                    // started with the root commit, which has no parents.
                    let Some(parent_operation_pack) = operation_packs.get(&parent_id.detach())
                    else {
                        return Err(read::Error::RootWithParent);
                    };

                    if parent_operation_pack.edit_time >= operation_pack.edit_time {
                        return Err(read::Error::ParentWithGreaterEditTime {
                            parent_time: parent_operation_pack.edit_time,
                            child_time: operation_pack.edit_time,
                        });
                    }

                    // To avoid an attack where clocks are pushed toward the u64 overflow point, we
                    // make sure that the clocks don't jump too far into the
                    // future. We ignore merge commits here to allow merging
                    // after a long time without breaking anything; as long as
                    // there is one valid chain of small hops, it's fine.
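                    // (For example, a parent pack at edit time 3 followed by a
                    // non-merge child at edit time 2_000_000 would trip this
                    // check, while the same child behind a merge would not.)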
                    if !is_merge
                        && (operation_pack.edit_time.value()
                            - parent_operation_pack.edit_time.value())
                            > 1_000_000
                    {
                        panic!(
                            "Noticed a lamport clock jumping too far into the future, this is \
                             likely an attack."
                        )
                    }
                }

                operation_count += operation_pack.operations.len();
                assert!(operation_packs.insert(commit.id, operation_pack).is_none());
                is_first_commit = false;
            }

            (operation_packs, operation_count)
        };

        {
            // Now that we have checked that the clocks are fine, we can update
            // this repo's clocks.

            // TODO(@bpeetz): Why are we updating the repo's clocks if they are updated
            // either way on the next write? <2025-04-16>
            for operation_pack in operation_packs.values() {
                lamport::repository::get_or_init_clock(
                    replica.repo(),
                    format!("{}-create", Self::NAMESPACE).as_str(),
                )?
                .witness(operation_pack.create_time.unwrap_or(lamport::Time::from(0)))?;

                lamport::repository::get_or_init_clock(
                    replica.repo(),
                    format!("{}-edit", Self::NAMESPACE).as_str(),
                )?
                .witness(operation_pack.edit_time)?;
            }
        }

        let sorted_operation_packs = {
            // Now that we know that the topological order and clocks are fine, we order the
            // operation packs based on the logical clocks, entirely ignoring
            // the DAG topology.

            // Although we checked that every parent has a lower edit time than its
            // children, we still need to sort the DAG, because multiple
            // children were not compared with each other.
            // Example:
            //      A
            //    / | \
            //   B  C  D
            //    \ | /
            //      E
            // In this case, A < B ⋀ A < C ⋀ A < D is true, but we need a linear sorting
            // instead of this tree.

            let mut operation_packs: Vec<_> = operation_packs.into_values().collect();
            operation_packs.sort_unstable_by(|a, b| {
                let cmp = a.edit_time.cmp(&b.edit_time);

                // > We need to check for equal edit times, which would mean that we had
                // > concurrent editing over different machines, and we can't tell
                // > which one came first.
                //
                // > So, what now? We still need a total ordering, and the most stable possible.
                // > As a secondary ordering, we can order based on a hash of
                // > the serialized Operations in the operation pack.
                // > It doesn't carry much meaning, but it's unbiased and hard to abuse.
                // > This is a lexicographic ordering on the stringified ID.
                //
                // Git-bug stores its IDs as literal strings; as such, we unfortunately
                // cannot use our IDs directly and need to first encode them as hex
                // strings, so that we use the same sorting order as git-bug.

                if cmp == Ordering::Equal {
                    return a.id().to_string().cmp(&b.id().to_string());
                }

                cmp
            });

            operation_packs
        };

        let (operations, create_time, edit_time) = {
            // Now we can unpack the operation packs.

            let mut operations = Vec::with_capacity(operation_count);
            let mut create_time = lamport::Time::from(1);
            let mut edit_time = lamport::Time::from(1);

            for pack in sorted_operation_packs {
                operations.extend(pack.operations);

                if pack.create_time > Some(create_time) {
                    create_time = pack.create_time.expect("Is some");
                }
                if pack.edit_time > edit_time {
                    edit_time = pack.edit_time;
                }
            }

            (
                Operations::from_operations(operations)?,
                create_time,
                edit_time,
            )
        };

        // Safety:
        // - We check everything that git-bug also checks, so this should be fine?
        let me = unsafe { Self::from_parts(operations, create_time, edit_time, last_id) };

        impl_cache! {@populate replica.db, key.as_slice(), &me}

        assert_eq!(me.id(), id, "We should be able to find the correct id");

        Ok(me)
    }
}

/// [`Entity`] read errors.
pub mod read {
    use super::{
        Entity, EntityRead, lamport,
        operation::{operation_pack, operations},
    };
    use crate::replica;

    /// The error returned by [`EntityRead::read`].
    #[allow(missing_docs)]
    #[derive(Debug, thiserror::Error)]
    pub enum Error<E: Entity + EntityRead> {
        #[error(transparent)]
        ReferenceResolve(#[from] super::find::Error),
        #[error(transparent)]
        BreadthFirstSearch(#[from] super::bfs::Error),

        #[error("The operations composing this entity are invalid, because {0}")]
        WrongOperationsSequence(#[from] operations::create::Error<E>),

        // TODO(@bpeetz): Replace all these validation errors with a type system that
        // makes it impossible to parse them (i.e., “Parse, don't validate”). <2025-04-15>
        #[error("Found a merge commit with operations in it.")]
        MergeWithOperations,
        #[error("Multiple leaves in the entity DAG")]
        MultipleLeafs,
        #[error("The root commit is missing its 'create-clock-' tree entry")]
        CreationLamportTimeUnset,

        #[error("The root commit seems to have parents?")]
        RootWithParent,

        #[error("Expected to find a correct operation pack, but found none, because {error}")]
        MalformedOperationPack {
            error: operation_pack::decode::Error,
        },

        #[error("A parent commit had an edit lamport time greater than or equal to its child's.")]
        ParentWithGreaterEditTime {
            parent_time: lamport::Time,
            child_time: lamport::Time,
        },

        #[error(transparent)]
        ClockRead(#[from] lamport::repository::get::Error),
        #[error(transparent)]
        ClockWitness(#[from] lamport::persistent::write::Error),

        #[error(transparent)]
        CustomRead(<E as EntityRead>::CustomReadError),

        #[error(transparent)]
        CacheError(#[from] replica::cache::Error),
    }

    // TODO(@bpeetz): The generic impl below doesn't work, because rust thinks that
    // it is the same as core's `impl<T> From<T> for T`.
    // It is not the same, but more specific.
    // So people will just have to implement this themselves for their specific
    // error, without generics. <2025-04-21>
    // ```rust
    // impl<E: Entity + EntityRead> From<<E as EntityRead>::CustomReadError> for Error<E> {
    //     fn from(value: <E as EntityRead>::CustomReadError) -> Self {
    //         Self::CustomRead(value)
    //     }
    // }
    // ```
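    //
    // A concrete, non-generic impl like the following compiles fine, since it
    // cannot overlap with core's blanket impl (`MyEntity` and `MyReadError`
    // are hypothetical stand-ins for a specific implementer):
    // ```rust
    // impl From<MyReadError> for Error<MyEntity> {
    //     fn from(value: MyReadError) -> Self {
    //         Self::CustomRead(value)
    //     }
    // }
    // ```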
}