Skip to main content

mkit_core/ops/
graph.rs

1//! Commit-graph traversal helpers.
2//!
3//! This module exposes [`collect_ancestor_set`]. Higher-level walks
4//! (`is_ancestor`, `find_merge_base`) live in [`super::merge`] alongside
5//! the merge algorithm that consumes them.
6//!
7//! Bound: at most [`MAX_ANCESTORS`] (`10_000`) commits are visited per
8//! call. Beyond that the walk stops silently — callers asking about
9//! pathologically deep histories get a partial answer rather than an
10//! OOM.
11
12use std::collections::{BTreeSet, HashSet, VecDeque};
13use std::hash::BuildHasher;
14
15use crate::hash::Hash;
16use crate::object::Object;
17use crate::store::{ObjectStore, StoreError};
18
19/// Hard cap on commits visited per call.
20pub const MAX_ANCESTORS: usize = 10_000;
21
22/// Collect the set of all ancestor commits of `start`, including
23/// `start` itself, by DFS over `Commit::parents`. The walk:
24///
25/// * Adds `start` to `set` even if its object is not in the store
26///   (the hash is recorded, then the missing-object error
27///   short-circuits the parent walk for that node).
28/// * Treats non-commit objects at a hash as terminators (no parents to
29///   follow).
30/// * Stops cleanly after [`MAX_ANCESTORS`] inserts.
31///
32/// # Errors
33///
34/// Only [`StoreError::Io`] / [`StoreError::HashMismatch`] /
35/// [`StoreError::ObjectTooLarge`] / [`StoreError::Decode`] propagate.
36/// `ObjectNotFound` is *swallowed* — see test
37/// `handles_non_existent_parent_gracefully`.
38pub fn collect_ancestor_set<S: BuildHasher>(
39    store: &ObjectStore,
40    start: Hash,
41    set: &mut HashSet<Hash, S>,
42) -> Result<(), StoreError> {
43    let mut stack: Vec<Hash> = Vec::new();
44    stack.push(start);
45
46    let mut count: usize = 0;
47    while let Some(current) = stack.pop() {
48        if count >= MAX_ANCESTORS {
49            break;
50        }
51        if !set.insert(current) {
52            continue;
53        }
54        count += 1;
55
56        match store.read_object(&current) {
57            Ok(Object::Commit(c)) => {
58                for &parent in &c.parents {
59                    stack.push(parent);
60                }
61            }
62            // Non-commit (or unreadable) — stop walking from this node,
63            // but we keep the hash in `set`.
64            Ok(_) | Err(StoreError::ObjectNotFound(_)) => {}
65            Err(e) => return Err(e),
66        }
67    }
68    Ok(())
69}
70
71/// Hard cap on the total number of objects [`reachable_objects`] will
72/// visit per call. Matches the scale of [`MAX_ANCESTORS`] but applies
73/// to the full object closure (commits + trees + blobs + chunks), so it
74/// is intentionally larger. A repo that exceeds this cap has to split
75/// pushes — the push-path is the only caller for now.
76pub const MAX_REACHABLE: usize = 10_000_000;
77
78/// Collect every object reachable from commit `root` — the full closure
79/// needed to reconstruct the commit on a fresh store. Walks, in order:
80///
81/// 1. `root` commit → its `tree_hash` + every `parent`.
82/// 2. each tree → every entry's `object_hash` (blob / tree / chunked-blob).
83/// 3. nested trees → recurse.
84/// 4. chunked-blob manifests → every `chunks[i]` hash.
85///
86/// Deltas are not produced by any path the push code drives, and remix
87/// objects are walked like commits (tree + parents) per SPEC-OBJECTS §6.
88///
89/// Returns a [`BTreeSet`] so iteration order is deterministic — the
90/// push code uses that to build reproducible packfiles. Deduplication
91/// happens naturally via the set.
92///
93/// # Errors
94///
95/// - [`StoreError::ObjectNotFound`] if `root` itself is missing from
96///   the store. Missing *referenced* objects (e.g. a tree listing a
97///   blob that got pruned) are a harder failure and propagate via the
98///   same variant — callers pushing partial repos should fix their
99///   store before pushing.
100/// - Other [`StoreError`] variants propagate as-is.
101pub fn reachable_objects(store: &ObjectStore, root: &Hash) -> Result<BTreeSet<Hash>, StoreError> {
102    reachable_closure(store, std::iter::once(root))
103}
104
105/// Collect every object reachable from **any** of `roots` — the
106/// multi-root generalization of [`reachable_objects`]. This is the
107/// closure `mkit gc` (#233) keeps live: seed it with the full retention
108/// root set from [`super::gc::collect_roots`] and every object NOT in the
109/// result is unreachable.
110///
111/// Identical walk semantics to [`reachable_objects`] (commits/remixes →
112/// tree + parents, trees → entries, chunked-blobs → chunks, tags →
113/// target, blobs/deltas are leaves), deduped via the returned
114/// [`BTreeSet`], and capped at [`MAX_REACHABLE`] total objects. With a
115/// single root the result is byte-identical to [`reachable_objects`].
116///
117/// # Errors
118///
119/// Propagates [`StoreError`] as [`reachable_objects`] does — notably
120/// [`StoreError::ObjectNotFound`] for a missing root or referenced
121/// object, so a caller (gc) fails closed rather than under-counting the
122/// live set.
123pub fn reachable_closure<'a, I>(store: &ObjectStore, roots: I) -> Result<BTreeSet<Hash>, StoreError>
124where
125    I: IntoIterator<Item = &'a Hash>,
126{
127    // Push-path callers tolerate cap truncation (they split pushes), so
128    // the truncation flag is dropped here. gc must NOT — it uses
129    // [`reachable_closure_checked`] and fails closed.
130    reachable_closure_checked(store, roots).map(|(out, _truncated)| out)
131}
132
133/// Like [`reachable_closure`] but also reports whether the
134/// [`MAX_REACHABLE`] cap truncated the walk (`true` = incomplete). A
135/// caller that would *delete* unreachable objects (gc) MUST treat
136/// `truncated == true` as fatal: beyond the cap the "unreachable" verdict
137/// is unsound, so pruning would drop live data.
138///
139/// # Errors
140///
141/// Propagates [`StoreError`] as [`reachable_objects`] does.
142pub fn reachable_closure_checked<'a, I>(
143    store: &ObjectStore,
144    roots: I,
145) -> Result<(BTreeSet<Hash>, bool), StoreError>
146where
147    I: IntoIterator<Item = &'a Hash>,
148{
149    let mut out: BTreeSet<Hash> = BTreeSet::new();
150    let mut queue: VecDeque<Hash> = VecDeque::new();
151    for root in roots {
152        queue.push_back(*root);
153    }
154
155    let mut truncated = false;
156    while let Some(h) = queue.pop_front() {
157        if out.len() >= MAX_REACHABLE {
158            // Still had work to do but hit the cap — the closure is
159            // incomplete.
160            truncated = true;
161            break;
162        }
163        if !out.insert(h) {
164            continue;
165        }
166        // Read the object. Missing objects bubble up — a reachable-set
167        // walk for a push must refuse to build a known-broken pack.
168        let obj = store.read_object(&h)?;
169        match obj {
170            Object::Commit(c) => {
171                queue.push_back(c.tree_hash);
172                for p in c.parents {
173                    queue.push_back(p);
174                }
175            }
176            Object::Remix(r) => {
177                queue.push_back(r.tree_hash);
178                for p in r.parents {
179                    queue.push_back(p);
180                }
181                // Remix `sources` are foreign-repo pointers (SPEC-OBJECTS
182                // §6) — by definition NOT in our store, so don't queue them.
183            }
184            Object::Tree(t) => {
185                for e in t.entries {
186                    queue.push_back(e.object_hash);
187                }
188            }
189            Object::ChunkedBlob(cb) => {
190                for c in cb.chunks {
191                    queue.push_back(c);
192                }
193            }
194            Object::Tag(t) => {
195                // An annotated/signed tag points at one target object
196                // (SPEC-OBJECTS §6a). Walk it so a tag is a valid pack
197                // root.
198                queue.push_back(t.target);
199            }
200            Object::Blob(_) | Object::Delta(_) => {
201                // Leaves — nothing to walk.
202            }
203        }
204    }
205    Ok((out, truncated))
206}
207
208// =====================================================================
209// Tests
210// =====================================================================
211
212#[cfg(test)]
213#[allow(clippy::many_single_char_names)] // single-letter commit names keep the test tables compact
214mod tests {
215    use super::*;
216    use crate::hash;
217    use crate::object::EntryMode;
218    use crate::object::{Blob, Commit, Identity, Object, Tree, TreeEntry};
219    use crate::serialize;
220    use tempfile::TempDir;
221
222    fn store() -> (TempDir, ObjectStore) {
223        let d = TempDir::new().unwrap();
224        let s = ObjectStore::init(d.path()).unwrap();
225        (d, s)
226    }
227
228    fn put_blob(s: &ObjectStore, data: &[u8]) -> Hash {
229        let bytes = serialize::serialize(&Object::Blob(Blob {
230            data: data.to_vec(),
231        }))
232        .unwrap();
233        s.write(&bytes).unwrap()
234    }
235
236    fn put_tree(s: &ObjectStore, entries: Vec<TreeEntry>) -> Hash {
237        let bytes = serialize::serialize(&Object::Tree(Tree { entries })).unwrap();
238        s.write(&bytes).unwrap()
239    }
240
241    fn make_single_file_tree(s: &ObjectStore, name: &[u8], data: &[u8]) -> Hash {
242        let blob = put_blob(s, data);
243        put_tree(
244            s,
245            vec![TreeEntry {
246                name: name.to_vec(),
247                mode: EntryMode::Blob,
248                object_hash: blob,
249            }],
250        )
251    }
252
253    fn make_commit(s: &ObjectStore, tree: Hash, parents: &[Hash], message: &str) -> Hash {
254        let c = Commit {
255            tree_hash: tree,
256            parents: parents.to_vec(),
257            author: Identity::ed25519([0; 32]),
258            signer: [0; 32],
259            message: message.as_bytes().to_vec(),
260            timestamp: message.len() as u64, // tiny per-commit divergence avoids store dedup
261            message_hash: [0; 32],
262            content_digest: [0; 32],
263            signature: [0; 64],
264        };
265        let bytes = serialize::serialize(&Object::Commit(c)).unwrap();
266        s.write(&bytes).unwrap()
267    }
268
269    #[test]
270    fn linear_chain_3_commits() {
271        let (_d, s) = store();
272        let tree = make_single_file_tree(&s, b"f", b"data");
273        let c1 = make_commit(&s, tree, &[], "c1");
274        let c2 = make_commit(&s, tree, &[c1], "c2");
275        let c3 = make_commit(&s, tree, &[c2], "c3");
276
277        let mut set = HashSet::new();
278        collect_ancestor_set(&s, c3, &mut set).unwrap();
279        assert_eq!(set.len(), 3);
280        assert!(set.contains(&c1));
281        assert!(set.contains(&c2));
282        assert!(set.contains(&c3));
283    }
284
285    #[test]
286    fn diamond_dag() {
287        let (_d, s) = store();
288        let tree = make_single_file_tree(&s, b"f.txt", b"data");
289        let c1 = make_commit(&s, tree, &[], "c1");
290        let c2 = make_commit(&s, tree, &[c1], "c2");
291        let c3 = make_commit(&s, tree, &[c1], "c3");
292        let c4 = make_commit(&s, tree, &[c2, c3], "c4");
293
294        let mut set = HashSet::new();
295        collect_ancestor_set(&s, c4, &mut set).unwrap();
296        assert_eq!(set.len(), 4);
297        assert!(set.contains(&c1));
298        assert!(set.contains(&c2));
299        assert!(set.contains(&c3));
300        assert!(set.contains(&c4));
301    }
302
303    #[test]
304    fn root_commit_alone() {
305        let (_d, s) = store();
306        let tree = make_single_file_tree(&s, b"f.txt", b"data");
307        let c1 = make_commit(&s, tree, &[], "root");
308
309        let mut set = HashSet::new();
310        collect_ancestor_set(&s, c1, &mut set).unwrap();
311        assert_eq!(set.len(), 1);
312        assert!(set.contains(&c1));
313    }
314
315    #[test]
316    fn handles_non_existent_parent_gracefully() {
317        let (_d, s) = store();
318        let tree = make_single_file_tree(&s, b"f.txt", b"data");
319        let fake_parent = hash::hash(b"nonexistent-parent");
320        let c1 = make_commit(&s, tree, &[fake_parent], "orphan");
321
322        let mut set = HashSet::new();
323        collect_ancestor_set(&s, c1, &mut set).unwrap();
324        // Both c1 and the fake hash end up in the set; the fake hash
325        // doesn't continue the walk because the object lookup fails.
326        assert_eq!(set.len(), 2);
327        assert!(set.contains(&c1));
328        assert!(set.contains(&fake_parent));
329    }
330
331    #[test]
332    fn empty_store_records_starting_hash() {
333        let (_d, s) = store();
334        let fake = hash::hash(b"does-not-exist");
335        let mut set = HashSet::new();
336        collect_ancestor_set(&s, fake, &mut set).unwrap();
337        assert_eq!(set.len(), 1);
338        assert!(set.contains(&fake));
339    }
340
341    // =================================================================
342    // reachable_objects — full closure from a commit.
343    // =================================================================
344
345    #[test]
346    fn reachable_single_commit_includes_tree_and_blob() {
347        let (_d, s) = store();
348        let blob = put_blob(&s, b"hi");
349        let tree = put_tree(
350            &s,
351            vec![TreeEntry {
352                name: b"f".to_vec(),
353                mode: EntryMode::Blob,
354                object_hash: blob,
355            }],
356        );
357        let c1 = make_commit(&s, tree, &[], "c1");
358        let reach = reachable_objects(&s, &c1).unwrap();
359        assert!(reach.contains(&c1));
360        assert!(reach.contains(&tree));
361        assert!(reach.contains(&blob));
362        assert_eq!(reach.len(), 3);
363    }
364
365    #[test]
366    fn reachable_walks_parents_and_dedups() {
367        let (_d, s) = store();
368        let t = make_single_file_tree(&s, b"f", b"x");
369        let c1 = make_commit(&s, t, &[], "c1");
370        let c2 = make_commit(&s, t, &[c1], "c2");
371        let reach = reachable_objects(&s, &c2).unwrap();
372        // c1, c2, one shared tree, one shared blob
373        assert_eq!(reach.len(), 4);
374        assert!(reach.contains(&c1));
375        assert!(reach.contains(&c2));
376        assert!(reach.contains(&t));
377    }
378
379    #[test]
380    fn reachable_walks_nested_trees() {
381        let (_d, s) = store();
382        let leaf = put_blob(&s, b"leaf");
383        let inner = put_tree(
384            &s,
385            vec![TreeEntry {
386                name: b"leaf".to_vec(),
387                mode: EntryMode::Blob,
388                object_hash: leaf,
389            }],
390        );
391        let outer = put_tree(
392            &s,
393            vec![TreeEntry {
394                name: b"sub".to_vec(),
395                mode: EntryMode::Tree,
396                object_hash: inner,
397            }],
398        );
399        let c1 = make_commit(&s, outer, &[], "c");
400        let reach = reachable_objects(&s, &c1).unwrap();
401        assert!(reach.contains(&outer));
402        assert!(reach.contains(&inner));
403        assert!(reach.contains(&leaf));
404    }
405
406    #[test]
407    fn reachable_walks_chunked_blob_chunks() {
408        use crate::object::ChunkedBlob;
409        let (_d, s) = store();
410        let c0 = put_blob(&s, b"A");
411        let c1 = put_blob(&s, b"B");
412        let cb_bytes = serialize::serialize(&Object::ChunkedBlob(ChunkedBlob {
413            total_size: 2,
414            chunk_size: 1,
415            chunks: vec![c0, c1],
416        }))
417        .unwrap();
418        let cb = s.write(&cb_bytes).unwrap();
419        let tree = put_tree(
420            &s,
421            vec![TreeEntry {
422                name: b"big".to_vec(),
423                mode: EntryMode::Blob,
424                object_hash: cb,
425            }],
426        );
427        let commit = make_commit(&s, tree, &[], "c");
428        let reach = reachable_objects(&s, &commit).unwrap();
429        assert!(reach.contains(&cb));
430        assert!(reach.contains(&c0));
431        assert!(reach.contains(&c1));
432    }
433
434    #[test]
435    fn reachable_missing_root_errors() {
436        let (_d, s) = store();
437        let fake = hash::hash(b"nope");
438        let err = reachable_objects(&s, &fake).unwrap_err();
439        assert!(matches!(err, StoreError::ObjectNotFound(_)));
440    }
441
442    #[test]
443    fn reachable_is_deterministic() {
444        let (_d, s) = store();
445        let t = make_single_file_tree(&s, b"f", b"x");
446        let c1 = make_commit(&s, t, &[], "c1");
447        let c2 = make_commit(&s, t, &[c1], "c2");
448        let a: Vec<Hash> = reachable_objects(&s, &c2).unwrap().into_iter().collect();
449        let b: Vec<Hash> = reachable_objects(&s, &c2).unwrap().into_iter().collect();
450        assert_eq!(a, b);
451    }
452}