git_bug/replica/entity/mod.rs
// git-bug-rs - A rust library for interfacing with git-bug repositories
//
// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de>
// SPDX-License-Identifier: GPL-3.0-or-later
//
// This file is part of git-bug-rs/git-bug.
//
// You should have received a copy of the License along with this program.
// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>.

//! The core trait of git-bug-rs. All values that we commit to storage are
//! [`Entities`][`Entity`], that is, they implement the [`Entity`] trait.
//!
//! This trait implements reading (and, in the future, writing) the data
//! structure expected by `git-bug` in the commits, whilst delegating the
//! encoding/decoding of operations to the implementer.

use std::{
    cmp::Ordering,
    collections::{HashMap, HashSet, VecDeque},
    fmt::Debug,
    fs,
    time::{Duration, SystemTime},
};

use gix::{ObjectId, Repository};
use serde::{Serialize, de::DeserializeOwned};

use self::{
    id::entity_id::EntityId,
    lamport::Clock,
    operation::{
        operation_data::OperationData, operation_pack::OperationPack, operations::Operations,
    },
    snapshot::{
        Snapshot,
        timeline::{Timeline, history_step::HistoryStep},
    },
};
use super::{Replica, cache::impl_cache};

pub mod id;
pub mod identity;
pub mod lamport;
pub mod operation;
pub mod snapshot;

pub mod nonce;
pub mod timestamp;

/// [`Entities`][`Entity`] are the “objects” that compose a
/// [`Replica`][`super::Replica`].
///
/// This trait provides the common functionality needed for each [`Entity`].
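///
/// # Example
///
/// A minimal sketch of how an implementer is used; `Issue` is a hypothetical
/// implementing type, not an item of this crate:
///
/// ```ignore
/// // Given some `issue: Issue` where `Issue: Entity`:
/// let id = issue.id(); // Derived from the root operation.
///
/// // Lamport times bracketing the entity's history:
/// assert!(issue.create_time() <= issue.edit_time());
/// ```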
pub trait Entity: Sized + Debug + DeserializeOwned + Serialize {
    /// The name of the entity (issue, pull-request, ...), for human
    /// consumption.
    const TYPENAME: &str;

    /// The namespace in git references (bugs, prs, ...).
    const NAMESPACE: &str;

    /// The expected format version number, which can be used for data
    /// migration/upgrade.
    const FORMAT_VERSION: usize;

    /// The type of operation this [`Entity`] uses.
    type OperationData: Debug + OperationData + Clone + Serialize + DeserializeOwned;

    /// A step in the history of a [`Snapshot's`][`snapshot::Snapshot`]
    /// [`Timeline`][`snapshot::timeline::Timeline`].
    type HistoryStep: Debug + HistoryStep;

    /// The complete timeline of a [`Snapshot`] of this [`Entity`].
    type Timeline: Debug + Timeline<Self>;

    /// Return this [`Entity`]'s [`EntityId`].
    fn id(&self) -> EntityId<Self>
    where
        Self: Sized,
    {
        self.operations().root().id()
    }

    /// Generate a snapshot of this [`Entity`]. This is a frozen collection of
    /// this [`Entity's`][`Entity`] [`Operations`][`operation::Operation`].
    ///
    /// # Note
    /// The generic [`Snapshot`][`snapshot::Snapshot`] structure is extended by
    /// both identity and issue [`Entities`][`Entity`] with getters for
    /// their specific values.
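    ///
    /// # Example
    ///
    /// A minimal sketch, assuming a hypothetical `Issue` entity whose extended
    /// snapshot exposes a `title()` getter (neither name exists in this
    /// module):
    ///
    /// ```ignore
    /// // Fold all operations into the entity's current state:
    /// let snapshot = issue.snapshot();
    /// println!("current title: {}", snapshot.title());
    /// ```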
    #[must_use]
    fn snapshot(&self) -> Snapshot<Self> {
        let mut base = Snapshot::<Self>::from_root_operation(self.operations().root());

        // We skip the root operation, as we already (in effect) applied it when
        // creating `base`.
        for operation in self.operations().iter().skip(1) {
            base.apply(operation);
        }

        base
    }

    /// Return the [`Operations`] that compose this [`Entity`].
    fn operations(&self) -> &Operations<Self>
    where
        Self: Sized;

    /// Return the [`lamport::Time`] that was set when this [`Entity`] was
    /// first created.
    fn create_time(&self) -> &lamport::Time
    where
        Self: Sized;

    /// Return the [`lamport::Time`] that was set when this [`Entity`] was last
    /// edited.
    fn edit_time(&self) -> &lamport::Time
    where
        Self: Sized;

    /// Return the [commit object id][`ObjectId`] of the last commit that added
    /// operations to this [`Entity`].
    fn current_head(&self) -> &gix::oid
    where
        Self: Sized;

    /// Construct this instance from the data stored on disk.
    ///
    /// This function cannot return an error, because the previous decoding
    /// functions (e.g.,
    /// [`Operation::from_value`][`operation::Operation::from_value`]) should
    /// have already sorted out invalid values.
    ///
    /// # Note
    /// This function is probably not what you want. It is only useful if you
    /// want to create this [`Entity`] from its on-disk serialization (or
    /// from a cache).
    ///
    /// # Safety
    /// You need to uphold the following invariants:
    /// - The `create_time` and `edit_time` must be valid for the repository at this time (i.e.,
    ///   they should have been computed by witnessing the other clocks).
    /// - The `current_head` must be the actual current head (i.e., `operations` must be computable
    ///   by starting a commit traversal from this point).
    unsafe fn from_parts(
        operations: Operations<Self>,
        create_time: lamport::Time,
        edit_time: lamport::Time,
        current_head: ObjectId,
    ) -> Self
    where
        Self: Sized;
}

#[allow(missing_docs)]
pub mod find {
    use crate::replica;

    #[derive(Debug, thiserror::Error)]
    pub enum Error {
        #[error("Expected to find a reference at {reference}, but found none, because {error}")]
        MissingReference {
            reference: String,
            error: gix::reference::find::existing::Error,
        },

        #[error(transparent)]
        CacheError(#[from] replica::cache::Error),
    }
}

#[allow(missing_docs)]
pub mod bfs {
    #[derive(Debug, thiserror::Error)]
    pub enum Error {
        #[error("Expected to find a git object with {id}, but found none, because {error}")]
        MissingObjectId {
            id: gix::ObjectId,
            error: gix::object::find::existing::Error,
        },
    }
}

/// Extension trait for [`Entities`][`Entity`].
///
/// These functions are responsible for reading and writing the [`Entity`] to
/// the git repository.
pub trait EntityRead: Entity {
    /// An error type that can be used to extend the default error returned by
    /// [`read`].
    ///
    /// Defining this is only really useful if you override the default
    /// [`read`] method. Otherwise, you can set this to
    /// [`std::convert::Infallible`] (we would do that here, but default
    /// associated types are not stable yet).
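    ///
    /// # Example
    ///
    /// The common case, keeping the default [`read`] implementation (`Issue`
    /// is a hypothetical implementer):
    ///
    /// ```ignore
    /// impl EntityRead for Issue {
    ///     // No custom error is needed when the default `read` is used.
    ///     type CustomReadError = std::convert::Infallible;
    /// }
    /// ```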
    type CustomReadError: Debug + std::error::Error;

    /// Get the commit associated with the last entry of this
    /// [`Entity's`][`Entity`] [`Operations`][`operation::Operation`].
    ///
    /// Conceptually, this just resolves the git reference at
    /// `refs/<Self::NAMESPACE>/<id>`.
    ///
    /// # Errors
    /// If the reference could not be resolved (e.g., it was missing).
    fn last_git_commit(replica: &Replica, id: EntityId<Self>) -> Result<ObjectId, find::Error> {
        // TODO(@bpeetz): This function should be fast, once git switches to its reftable impl by
        // default. But until then, we need to cache the lookups, because finding a ref can (e.g.,
        // if packed) end up with a linear search. <2025-05-27>

        let reference_path = id.to_ref_path();

        // Use the newest mtime of the loose ref file and the packed-refs file as a
        // cache-invalidation timestamp for the lookup below.
        let newest_time = {
            let packed_refs_time = fs::metadata(replica.repo().refs.packed_refs_path())
                .map(|metadata| metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH))
                .unwrap_or(SystemTime::UNIX_EPOCH);

            let refs_time = fs::metadata(replica.repo().refs.git_dir().join(&reference_path))
                .map(|metadata| metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH))
                .unwrap_or(SystemTime::UNIX_EPOCH);

            let max = std::cmp::max(refs_time, packed_refs_time);

            max.duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or(Duration::from_secs(0))
        };

        let key = {
            // Cache key layout: [seconds since epoch (8 bytes, big-endian)]
            // [namespace bytes][entity id bytes], zero-padded to 64 bytes.
            let mut base = [0; 64];

            #[allow(clippy::needless_as_bytes)]
            {
                assert!(
                    newest_time.as_secs().to_be_bytes().len()
                        + Self::NAMESPACE.as_bytes().len()
                        + id.as_id().as_slice().len()
                        <= 64
                );
            }

            for (byte, slot) in newest_time
                .as_secs()
                .to_be_bytes()
                .iter()
                .chain(Self::NAMESPACE.as_bytes())
                .chain(id.as_id().as_slice())
                .zip(base.iter_mut())
            {
                *slot = *byte;
            }

            base
        };

        impl_cache!(@mk_table "reftable");

        impl_cache! {@lookup replica.db(), &key[..]};

        let me = replica
            .repo()
            .find_reference(&reference_path)
            .map_err(|err| find::Error::MissingReference {
                reference: reference_path,
                error: err,
            })?
            .target()
            .id()
            .to_owned();

        impl_cache! {@populate replica.db(), &key[..], &me};

        Ok(me)
    }

    /// A breadth-first search to get a topological order of the operations DAG,
    /// where we discover the parent commits and go back in time up to the
    /// chronological root.
    ///
    /// # Errors
    /// If one of the git ids has no commit object attached to it.
    fn breadth_first_search(
        repo: &Repository,
        root_id: ObjectId,
    ) -> Result<Vec<gix::Commit<'_>>, bfs::Error> {
        // A FIFO queue, so that the traversal is actually breadth-first.
        let mut queue: VecDeque<ObjectId> = VecDeque::with_capacity(32);
        let mut visited: HashSet<ObjectId> = HashSet::new();
        let mut bfs_order: Vec<gix::Commit<'_>> = Vec::with_capacity(32);

        queue.push_back(root_id);

        while let Some(current_id) = queue.pop_front() {
            let commit = {
                let object =
                    repo.find_object(current_id)
                        .map_err(|err| bfs::Error::MissingObjectId {
                            id: current_id,
                            error: err,
                        })?;
                object.into_commit()
            };

            for parent in commit.parent_ids() {
                if !visited.contains(&parent.detach()) {
                    queue.push_back(parent.detach());
                    visited.insert(parent.detach());
                }
            }

            bfs_order.push(commit);
        }

        Ok(bfs_order)
    }

    /// Fetch this [`Entity`] from a git repository and decode it.
    ///
    /// # Errors
    /// If the associated git operations fail.
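    ///
    /// # Example
    ///
    /// A minimal sketch, again with a hypothetical `Issue` implementer and an
    /// already-opened `replica`:
    ///
    /// ```ignore
    /// let issue = Issue::read(&replica, id)?;
    /// assert_eq!(issue.id(), id);
    /// ```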
    // TODO(@bpeetz): Split this function up <2025-04-16>
    #[allow(clippy::too_many_lines)]
    fn read(replica: &Replica, id: EntityId<Self>) -> Result<Self, read::Error<Self>>
    where
        Self: Sized,
    {
        impl_cache!(@mk_table "entities");

        let last_id = Self::last_git_commit(replica, id)?;

        // The cache key is the entity id concatenated with the current head commit,
        // so any new operation invalidates the cached entry.
        let mut key = id.as_id().as_slice().to_owned();
        key.extend(last_id.as_slice());
        impl_cache! {@lookup replica.db, key.as_slice()}

        let (operation_packs, operation_count) = {
            // Now, we can reverse this topological order and read the commits in an
            // order where we are sure to have read all the chronological ancestors
            // when we read a commit.

            // The next steps are to:
            // 1) read the operation packs, and
            // 2) make sure that the clocks' causality respects the DAG topology.
            let bfs_order = Self::breadth_first_search(replica.repo(), last_id)?;

            let mut operation_packs: HashMap<ObjectId, OperationPack<Self>> = HashMap::new();
            let mut operation_count = 0;

            let mut is_first_commit = true;
            for commit in bfs_order.into_iter().rev() {
                let is_merge = commit.parent_ids().count() > 1;

                // Verify that the DAG structure has a single chronological root, so
                // only the root can have no parents. Said otherwise, the DAG needs to
                // have exactly one leaf.
                if !is_first_commit && commit.parent_ids().count() == 0 {
                    return Err(read::Error::MultipleLeafs);
                }

                let operation_pack =
                    OperationPack::<Self>::from_repository(&replica.db, &commit)
                        .map_err(|err| read::Error::MalformedOperationPack { error: err })?;

                if is_merge && !operation_pack.operations.is_empty() {
                    return Err(read::Error::MergeWithOperations);
                }

                if is_first_commit && operation_pack.create_time.is_none() {
                    return Err(read::Error::CreationLamportTimeUnset);
                }

                // Make sure that the lamport clocks' causality matches the DAG topology.
                for parent_id in commit.parent_ids() {
                    // All of the parents should already be in the map, as we iterate
                    // through the commits in reversed order and thus started with the
                    // root commit, which has no parents.
                    let Some(parent_operation_pack) = operation_packs.get(&parent_id.detach())
                    else {
                        return Err(read::Error::RootWithParent);
                    };

                    if parent_operation_pack.edit_time >= operation_pack.edit_time {
                        return Err(read::Error::ParentWithGreaterEditTime {
                            parent_time: parent_operation_pack.edit_time,
                            child_time: operation_pack.edit_time,
                        });
                    }

                    // To avoid an attack where clocks are pushed toward the u64 overflow
                    // point, we make sure that the clocks don't jump too far into the
                    // future. We ignore merge commits here to allow merging after a long
                    // time without breaking anything; as long as there is one valid chain
                    // of small hops, it's fine.
                    if !is_merge
                        && (operation_pack.edit_time.value()
                            - parent_operation_pack.edit_time.value())
                            > 1_000_000
                    {
                        panic!(
                            "Noticed a lamport clock jumping too far into the future, \
                             this is likely an attack."
                        )
                    }
                }

                // Count each pack's operations exactly once per commit (the root commit
                // has no parents, and merge commits have more than one).
                operation_count += operation_pack.operations.len();

                assert!(operation_packs.insert(commit.id, operation_pack).is_none());
                is_first_commit = false;
            }

            (operation_packs, operation_count)
        };

        {
            // Now that we have checked that the clocks are fine, we can update this
            // repo's clocks.

            // TODO(@bpeetz): Why are we updating the repo's clocks if they are updated
            // either way on the next write? <2025-04-16>
            for operation_pack in operation_packs.values() {
                lamport::repository::get_or_init_clock(
                    replica.repo(),
                    format!("{}-create", Self::NAMESPACE).as_str(),
                )?
                .witness(operation_pack.create_time.unwrap_or(lamport::Time::from(0)))?;

                lamport::repository::get_or_init_clock(
                    replica.repo(),
                    format!("{}-edit", Self::NAMESPACE).as_str(),
                )?
                .witness(operation_pack.edit_time)?;
            }
        }

        let sorted_operation_packs = {
            // Now that we know that the topological order and clocks are fine, we
            // order the operation packs based on the logical clocks, entirely
            // ignoring the DAG topology.

            // Although we checked that every parent has a lower edit time than its
            // children, we still need to sort the DAG, because multiple children
            // were not compared with each other.
            // Example:
            //      A
            //    / | \
            //   B  C  D
            //    \ | /
            //      E
            // In this case, A < B ⋀ A < C ⋀ A < D is true, but we need a linear
            // ordering instead of this tree.

            let mut operation_packs: Vec<_> = operation_packs.into_values().collect();
            operation_packs.sort_unstable_by(|a, b| {
                let cmp = a.edit_time.cmp(&b.edit_time);

                // > We need to check for equal edit times, which would mean that we had
                // > concurrent editing over different machines, and we can't tell
                // > which one came first.
                //
                // > So, what now? We still need a total ordering and the most stable possible.
                // > As a secondary ordering, we can order based on a hash of
                // > the serialized Operations in the operation pack.
                // > It doesn't carry much meaning but it's unbiased and hard to abuse.
                // > This is a lexicographic ordering on the stringified ID.
                //
                // git-bug stores its IDs as literal strings; as such, we unfortunately
                // cannot use our IDs directly and need to first encode them as hex
                // strings, so that we use the same sorting order as git-bug.

                if cmp == Ordering::Equal {
                    return a.id().to_string().cmp(&b.id().to_string());
                }

                cmp
            });

            operation_packs
        };

        let (operations, create_time, edit_time) = {
            // Now we can unpack the operation packs.

            let mut operations = Vec::with_capacity(operation_count);
            let mut create_time = lamport::Time::from(1);
            let mut edit_time = lamport::Time::from(1);

            for pack in sorted_operation_packs {
                operations.extend(pack.operations);

                if pack.create_time > Some(create_time) {
                    create_time = pack.create_time.expect("Is some");
                }
                if pack.edit_time > edit_time {
                    edit_time = pack.edit_time;
                }
            }

            (
                Operations::from_operations(operations)?,
                create_time,
                edit_time,
            )
        };

        // Safety:
        // - We check everything that git-bug also checks, so this should be fine?
        let me = unsafe { Self::from_parts(operations, create_time, edit_time, last_id) };

        impl_cache! {@populate replica.db, key.as_slice(), &me}

        assert_eq!(me.id(), id, "We should be able to find the correct id");

        Ok(me)
    }
}

/// [`Entity`] read errors.
pub mod read {
    use super::{
        Entity, EntityRead, lamport,
        operation::{operation_pack, operations},
    };
    use crate::replica;

    #[allow(missing_docs)]
    #[derive(Debug, thiserror::Error)]
    /// The error returned by [`EntityRead::read`].
    pub enum Error<E: Entity + EntityRead> {
        #[error(transparent)]
        ReferenceResolve(#[from] super::find::Error),
        #[error(transparent)]
        BreadthFirstSearch(#[from] super::bfs::Error),

        #[error("The operations composing this entity are invalid, because {0}")]
        WrongOperationsSequence(#[from] operations::create::Error<E>),

        // TODO(@bpeetz): Remove all these validation errors, with a type system that makes
        // the invalid states impossible to represent (i.e., “parse, don't validate”).
        // <2025-04-15>
        #[error("Found a merge commit with operations in it.")]
        MergeWithOperations,
        #[error("Multiple leaves in the entity DAG")]
        MultipleLeafs,
        #[error("The root commit is missing its 'create-clock-' tree entry")]
        CreationLamportTimeUnset,

        #[error("The root commit seems to have parents?")]
        RootWithParent,

        #[error("Expected to find a correct operation pack, but found none, because {error}")]
        MalformedOperationPack {
            error: operation_pack::decode::Error,
        },

        #[error("A parent commit had a greater or equal edit lamport time than its child.")]
        ParentWithGreaterEditTime {
            parent_time: lamport::Time,
            child_time: lamport::Time,
        },

        #[error(transparent)]
        ClockRead(#[from] lamport::repository::get::Error),
        #[error(transparent)]
        ClockWitness(#[from] lamport::persistent::write::Error),

        #[error(transparent)]
        CustomRead(<E as EntityRead>::CustomReadError),

        #[error(transparent)]
        CacheError(#[from] replica::cache::Error),
    }

    // TODO(@bpeetz): The `From` impl below doesn't work, because rust considers it to be
    // the same as core's blanket `impl<T> From<T> for T`.
    // It is not exactly the same, but more specific.
    // So people will just have to implement this themselves for their specific
    // error, without generics. <2025-04-21>
    // ```rust
    // impl<E: Entity + EntityRead> From<<E as EntityRead>::CustomReadError> for Error<E> {
    //     fn from(value: <E as EntityRead>::CustomReadError) -> Self {
    //         Self::CustomRead(value)
    //     }
    // }
    // ```
}