1use std::collections::BTreeSet;
32use std::fs;
33use std::io;
34use std::path::Path;
35
36use crate::hash::{self, Hash};
37use crate::index;
38use crate::store::{ObjectStore, StoreError};
39
40use super::conflict_state::{self, ORIG_HEAD};
41use super::graph::reachable_closure_checked;
42use super::rebase::{self, REBASE_DIR};
43use super::recovery;
44use super::stash;
45use crate::refs::{self, HEADS_DIR, REMOTES_DIR, TAGS_DIR};
46
47const ATTESTATIONS_DIR: &str = "attestations";
51
52const MAX_REF_WALK_DEPTH: usize = 64;
56
57#[derive(Debug, thiserror::Error)]
64#[non_exhaustive]
65pub enum GcRootsError {
66 #[error("refs: {0}")]
67 Refs(#[from] refs::RefError),
68 #[error("stash: {0}")]
69 Stash(#[from] stash::StashError),
70 #[error("conflict state: {0}")]
71 ConflictState(#[from] conflict_state::ConflictStateError),
72 #[error("rebase state: {0}")]
73 Rebase(#[from] rebase::RebaseError),
74 #[error("recovery log: {0}")]
75 Recovery(#[from] recovery::RecoveryError),
76 #[error("staging index: {0}")]
77 Index(#[from] index::IndexError),
78 #[error("object store: {0}")]
79 Store(#[from] StoreError),
80 #[error("malformed object id on disk: {0}")]
81 BadHash(#[from] hash::FromHexError),
82 #[error("io: {0}")]
83 Io(#[from] io::Error),
84 #[error("object graph exceeds the reachability cap; refusing to compute a partial keep-set")]
88 Truncated,
89 #[error("ref tree too deep at {0} (fail closed)")]
91 RefTooDeep(String),
92 #[error("refusing to gc: {0} is a symlink (objects may live outside the repo)")]
96 SymlinkedStore(String),
97}
98
99pub fn collect_roots(mkit_dir: &Path) -> Result<BTreeSet<Hash>, GcRootsError> {
111 let mut roots: BTreeSet<Hash> = BTreeSet::new();
112 let add = |h: Hash, set: &mut BTreeSet<Hash>| {
113 if h != hash::ZERO {
114 set.insert(h);
115 }
116 };
117
118 if let Some(h) = refs::resolve_head(mkit_dir)? {
120 add(h, &mut roots);
121 }
122
123 for ns in [HEADS_DIR, TAGS_DIR, REMOTES_DIR] {
132 walk_ref_roots_strict(&mkit_dir.join(ns), ns, 0, &mut roots)?;
133 }
134
135 let repo_root = mkit_dir.parent().unwrap_or(mkit_dir);
137 for entry in stash::list(repo_root)?.entries {
138 add(entry.commit_hash, &mut roots);
139 add(entry.parent_hash, &mut roots);
140 }
141
142 for entry in index::read_index(repo_root)?.entries {
149 add(entry.object_hash, &mut roots);
150 }
151
152 if let Some(h) = read_optional_hash(&mkit_dir.join(ORIG_HEAD))? {
154 add(h, &mut roots);
155 }
156
157 if let Some(m) = conflict_state::read_merge_state(mkit_dir)? {
159 add(m.merge_head, &mut roots);
160 add(m.orig_head, &mut roots);
161 }
162 if let Some(c) = conflict_state::read_cherry_pick_state(mkit_dir)? {
163 add(c.cherry_pick_head, &mut roots);
164 add(c.orig_head, &mut roots);
165 }
166 if let Some(r) = conflict_state::read_revert_state(mkit_dir)? {
167 add(r.revert_head, &mut roots);
168 add(r.orig_head, &mut roots);
169 }
170
171 if rebase::is_rebase_in_progress(mkit_dir) {
174 let st = rebase::read_state(mkit_dir)?;
175 add(st.orig_head, &mut roots);
176 add(st.onto, &mut roots);
177 for h in st.todo.into_iter().chain(st.done) {
178 add(h, &mut roots);
179 }
180 }
181
182 for dir in [mkit_dir.to_path_buf(), mkit_dir.join(REBASE_DIR)] {
187 for c in conflict_state::read_conflicts(&dir)? {
188 for h in [c.base_hash, c.ours_hash, c.theirs_hash]
189 .into_iter()
190 .flatten()
191 {
192 add(h, &mut roots);
193 }
194 }
195 }
196
197 for h in attested_commits(mkit_dir)? {
199 add(h, &mut roots);
200 }
201
202 for h in recovery::roots(mkit_dir)? {
207 add(h, &mut roots);
208 }
209
210 Ok(roots)
211}
212
213pub fn live_objects(store: &ObjectStore, mkit_dir: &Path) -> Result<BTreeSet<Hash>, GcRootsError> {
221 let roots = collect_roots(mkit_dir)?;
222 let (live, truncated) = reachable_closure_checked(store, roots.iter())?;
223 if truncated {
224 return Err(GcRootsError::Truncated);
225 }
226 Ok(live)
227}
228
229#[derive(Debug, Default, Clone, Copy)]
231pub struct GcReport {
232 pub scanned: usize,
234 pub live: usize,
236 pub kept_recent: usize,
239 pub pruned: usize,
241 pub bytes_reclaimed: u64,
243 pub dry_run: bool,
245}
246
247pub fn run_gc(
262 store: &ObjectStore,
263 mkit_dir: &Path,
264 now_secs: u64,
265 grace_secs: u64,
266 dry_run: bool,
267) -> Result<GcReport, GcRootsError> {
268 reject_symlink(mkit_dir)?;
273 reject_symlink(store.objects_root())?;
274
275 let live = live_objects(store, mkit_dir)?;
277 let all = store.iter_object_hashes()?;
278
279 let mut report = GcReport {
280 dry_run,
281 ..GcReport::default()
282 };
283 for h in all {
284 report.scanned += 1;
285 if live.contains(&h) {
286 report.live += 1;
287 continue;
288 }
289 let Ok(meta) = store.object_metadata(&h) else {
293 report.kept_recent += 1;
294 continue;
295 };
296 let age_known_old = meta
297 .modified()
298 .ok()
299 .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
300 .map(|d| d.as_secs())
301 .is_some_and(|mtime| now_secs.saturating_sub(mtime) >= grace_secs);
302 if !age_known_old {
303 report.kept_recent += 1;
304 continue;
305 }
306 let len = meta.len();
307 if !dry_run {
308 store.remove_object(&h)?;
309 }
310 report.pruned += 1;
311 report.bytes_reclaimed += len;
312 }
313 Ok(report)
314}
315
316fn walk_ref_roots_strict(
326 dir: &Path,
327 rel: &str,
328 depth: usize,
329 roots: &mut BTreeSet<Hash>,
330) -> Result<(), GcRootsError> {
331 if depth > MAX_REF_WALK_DEPTH {
332 return Err(GcRootsError::RefTooDeep(rel.to_owned()));
333 }
334 let rd = match fs::read_dir(dir) {
335 Ok(rd) => rd,
336 Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(()),
337 Err(e) => return Err(e.into()),
338 };
339 for entry in rd {
340 let entry = entry?;
341 let name = entry.file_name();
342 let name = name.to_string_lossy();
343 let ft = entry.file_type()?;
344 if ft.is_dir() {
345 walk_ref_roots_strict(&entry.path(), &format!("{rel}/{name}"), depth + 1, roots)?;
346 continue;
347 }
348 if !ft.is_file() || name.starts_with('.') {
349 continue;
351 }
352 let raw = fs::read_to_string(entry.path())?;
354 let h = hash::from_hex(raw.trim())?;
355 if h != hash::ZERO {
356 roots.insert(h);
357 }
358 }
359 Ok(())
360}
361
362fn attested_commits(mkit_dir: &Path) -> Result<Vec<Hash>, io::Error> {
366 let dir = mkit_dir.join(ATTESTATIONS_DIR);
367 let mut out = Vec::new();
368 let rd = match fs::read_dir(&dir) {
369 Ok(rd) => rd,
370 Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(out),
371 Err(e) => return Err(e),
372 };
373 for entry in rd {
374 let entry = entry?;
375 if !entry.file_type()?.is_dir() {
376 continue;
377 }
378 if let Some(name) = entry.file_name().to_str()
379 && let Ok(h) = hash::from_hex(name)
380 {
381 out.push(h);
382 }
383 }
384 Ok(out)
385}
386
387fn read_optional_hash(path: &Path) -> Result<Option<Hash>, GcRootsError> {
390 let raw = match fs::read_to_string(path) {
391 Ok(s) => s,
392 Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(None),
393 Err(e) => return Err(e.into()),
394 };
395 let trimmed = raw.trim();
396 if trimmed.is_empty() {
397 return Ok(None);
398 }
399 Ok(Some(hash::from_hex(trimmed)?))
400}
401
402fn reject_symlink(path: &Path) -> Result<(), GcRootsError> {
409 match std::fs::symlink_metadata(path) {
410 Ok(m) if m.file_type().is_symlink() => {
411 Err(GcRootsError::SymlinkedStore(path.display().to_string()))
412 }
413 Ok(_) => Ok(()),
414 Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
415 Err(e) => Err(e.into()),
416 }
417}
418
419#[cfg(test)]
420mod tests {
421 use super::*;
422 use crate::object::EntryMode;
423 use crate::object::{Blob, Commit, Identity, Object, Tree, TreeEntry};
424 use crate::serialize;
425 use std::fs;
426 use tempfile::TempDir;
427
428 fn repo() -> (TempDir, ObjectStore) {
430 let d = TempDir::new().unwrap();
431 let store = ObjectStore::init(d.path()).unwrap();
432 refs::init(&d.path().join(crate::MKIT_DIR)).unwrap();
433 (d, store)
434 }
435
436 fn mkit_dir(d: &TempDir) -> std::path::PathBuf {
437 d.path().join(crate::MKIT_DIR)
438 }
439
440 fn write_ref(md: &Path, rel: &str, h: &Hash) {
443 let path = md.join(rel);
444 fs::create_dir_all(path.parent().unwrap()).unwrap();
445 fs::write(path, format!("{}\n", hash::to_hex(h))).unwrap();
446 }
447
448 fn write_blob(s: &ObjectStore, data: &[u8]) -> Hash {
449 s.write(
450 &serialize::serialize(&Object::Blob(Blob {
451 data: data.to_vec(),
452 }))
453 .unwrap(),
454 )
455 .unwrap()
456 }
457
458 fn commit_one(s: &ObjectStore, name: &[u8], data: &[u8], parents: Vec<Hash>) -> (Hash, Hash) {
460 let blob = write_blob(s, data);
461 let tree = s
462 .write(
463 &serialize::serialize(&Object::Tree(Tree {
464 entries: vec![TreeEntry {
465 name: name.to_vec(),
466 mode: EntryMode::Blob,
467 object_hash: blob,
468 }],
469 }))
470 .unwrap(),
471 )
472 .unwrap();
473 let commit = s
474 .write(
475 &serialize::serialize(&Object::Commit(Commit {
476 tree_hash: tree,
477 parents,
478 author: Identity::opaque(b"t".to_vec()),
479 signer: [0u8; 32],
480 message: name.to_vec(),
481 timestamp: name.len() as u64,
483 message_hash: [0u8; 32],
484 content_digest: [0u8; 32],
485 signature: [0u8; 64],
486 }))
487 .unwrap(),
488 )
489 .unwrap();
490 (commit, blob)
491 }
492
493 #[test]
494 fn collect_roots_includes_branches_and_tags() {
495 let (d, s) = repo();
496 let md = mkit_dir(&d);
497 let (c1, _) = commit_one(&s, b"a", b"a", vec![]);
498 let (c2, _) = commit_one(&s, b"b", b"b", vec![]);
499 write_ref(&md, "refs/heads/main", &c1);
500 write_ref(&md, "refs/tags/v1", &c2);
501
502 let roots = collect_roots(&md).unwrap();
503 assert!(roots.contains(&c1), "branch tip must be a root");
504 assert!(roots.contains(&c2), "tag target must be a root");
505 }
506
507 #[test]
508 fn collect_roots_includes_orig_head_and_attested_commit() {
509 let (d, s) = repo();
510 let md = mkit_dir(&d);
511 let (orig, _) = commit_one(&s, b"o", b"o", vec![]);
512 let (att, _) = commit_one(&s, b"x", b"x", vec![]);
513 fs::write(md.join(ORIG_HEAD), format!("{}\n", hash::to_hex(&orig))).unwrap();
514 fs::create_dir_all(md.join(ATTESTATIONS_DIR).join(hash::to_hex(&att))).unwrap();
515
516 let roots = collect_roots(&md).unwrap();
517 assert!(roots.contains(&orig), "ORIG_HEAD must be a root");
518 assert!(roots.contains(&att), "attested commit must be a root");
519 }
520
521 #[test]
522 fn live_objects_keeps_only_reachable_closure() {
523 let (d, s) = repo();
524 let md = mkit_dir(&d);
525 let (kept, kept_blob) = commit_one(&s, b"keep", b"keep", vec![]);
526 let (orphan, orphan_blob) = commit_one(&s, b"orphan", b"orphan", vec![]);
528 write_ref(&md, "refs/heads/main", &kept);
529
530 let live = live_objects(&s, &md).unwrap();
531 assert!(
532 live.contains(&kept) && live.contains(&kept_blob),
533 "kept closure live"
534 );
535 assert!(
536 !live.contains(&orphan) && !live.contains(&orphan_blob),
537 "unreferenced objects must not be live"
538 );
539 }
540
541 #[test]
542 fn reachable_closure_is_union_of_single_root_closures() {
543 let (_d, s) = repo();
544 let (c1, b1) = commit_one(&s, b"a", b"a", vec![]);
545 let (c2, b2) = commit_one(&s, b"b", b"b", vec![]);
546 let multi = super::super::graph::reachable_closure(&s, [&c1, &c2]).unwrap();
547 let single1 = super::super::graph::reachable_objects(&s, &c1).unwrap();
548 let single2 = super::super::graph::reachable_objects(&s, &c2).unwrap();
549 let union: BTreeSet<Hash> = single1.union(&single2).copied().collect();
550 assert_eq!(multi, union);
551 assert!([c1, b1, c2, b2].iter().all(|h| multi.contains(h)));
552 }
553
554 #[test]
555 fn strict_walk_picks_up_nested_remote_ref() {
556 let (d, s) = repo();
557 let md = mkit_dir(&d);
558 let (c, _) = commit_one(&s, b"r", b"r", vec![]);
559 write_ref(&md, "refs/remotes/origin/main", &c);
560 assert!(
561 collect_roots(&md).unwrap().contains(&c),
562 "nested remote-tracking ref must be a root"
563 );
564 }
565
566 #[test]
567 fn run_gc_prunes_orphans_but_never_a_live_object() {
568 let (d, s) = repo();
569 let md = mkit_dir(&d);
570 let (kept, kept_blob) = commit_one(&s, b"keep", b"keep", vec![]);
572 write_ref(&md, "refs/heads/main", &kept);
573 let live = live_objects(&s, &md).unwrap();
574 let (orphan, orphan_blob) = commit_one(&s, b"orphan", b"orphan", vec![]);
576
577 let report = run_gc(&s, &md, u64::MAX, 0, false).unwrap();
579
580 for h in &live {
582 assert!(s.contains(h), "gc must never delete a live object");
583 }
584 assert!(
586 !s.contains(&orphan) && !s.contains(&orphan_blob),
587 "orphans pruned"
588 );
589 assert_eq!(report.live, live.len());
590 assert!(
591 report.pruned >= 2,
592 "orphan commit + blob pruned: {report:?}"
593 );
594 assert!(s.contains(&kept) && s.contains(&kept_blob));
596 }
597
598 #[test]
599 fn run_gc_keeps_staged_but_uncommitted_blobs() {
600 let (d, s) = repo();
601 let md = mkit_dir(&d);
602 let (kept, _) = commit_one(&s, b"k", b"k", vec![]);
604 write_ref(&md, "refs/heads/main", &kept);
605 let staged = write_blob(&s, b"staged-only");
608 let idx = index::Index {
609 entries: vec![index::IndexEntry {
610 path: "staged.txt".into(),
611 status: index::EntryStatus::Blob,
612 object_hash: staged,
613 mtime_ns: 0,
614 size: 0,
615 ino: 0,
616 ctime_ns: 0,
617 }],
618 };
619 index::write_index(d.path(), &idx).unwrap();
620
621 assert!(
622 collect_roots(&md).unwrap().contains(&staged),
623 "staged blob must be a retention root"
624 );
625 run_gc(&s, &md, u64::MAX, 0, false).unwrap();
627 assert!(
628 s.contains(&staged),
629 "gc must never delete staged-but-uncommitted content"
630 );
631 }
632
633 #[test]
634 fn run_gc_grace_window_keeps_recent_orphans() {
635 let (d, s) = repo();
636 let md = mkit_dir(&d);
637 let (kept, _) = commit_one(&s, b"k", b"k", vec![]);
638 write_ref(&md, "refs/heads/main", &kept);
639 let (orphan, _) = commit_one(&s, b"o", b"o", vec![]);
640
641 let report = run_gc(&s, &md, 0, u64::MAX, false).unwrap();
644 assert!(s.contains(&orphan), "recent orphan kept by grace window");
645 assert_eq!(report.pruned, 0);
646 assert!(report.kept_recent >= 1, "{report:?}");
647 }
648
649 #[cfg(unix)]
650 #[test]
651 fn run_gc_refuses_symlinked_objects_dir() {
652 use std::os::unix::fs::symlink;
653 let (d, s) = repo();
654 let md = mkit_dir(&d);
655 let (kept, _) = commit_one(&s, b"k", b"k", vec![]);
656 write_ref(&md, "refs/heads/main", &kept);
657
658 let external = d.path().join("external-objects");
661 let real_objects = md.join("objects");
662 fs::create_dir_all(&external).unwrap();
663 for entry in fs::read_dir(&real_objects).unwrap() {
665 let entry = entry.unwrap();
666 fs::rename(entry.path(), external.join(entry.file_name())).unwrap();
667 }
668 fs::remove_dir_all(&real_objects).unwrap();
669 symlink(&external, &real_objects).unwrap();
670
671 let err = run_gc(&s, &md, u64::MAX, 0, false).unwrap_err();
672 assert!(
673 matches!(err, GcRootsError::SymlinkedStore(_)),
674 "gc must refuse a symlinked objects dir, got {err:?}"
675 );
676 }
677
678 #[test]
679 fn run_gc_dry_run_deletes_nothing() {
680 let (d, s) = repo();
681 let md = mkit_dir(&d);
682 let (kept, _) = commit_one(&s, b"k", b"k", vec![]);
683 write_ref(&md, "refs/heads/main", &kept);
684 let (orphan, _) = commit_one(&s, b"o", b"o", vec![]);
685
686 let report = run_gc(&s, &md, u64::MAX, 0, true).unwrap();
687 assert!(report.dry_run && report.pruned >= 1, "{report:?}");
688 assert!(s.contains(&orphan), "dry run must not delete the orphan");
689 }
690
691 #[test]
692 fn run_gc_keeps_recovery_logged_orphan() {
693 let (d, s) = repo();
694 let md = mkit_dir(&d);
695 let (kept, _) = commit_one(&s, b"k", b"k", vec![]);
696 write_ref(&md, "refs/heads/main", &kept);
697 let (superseded, superseded_blob) = commit_one(&s, b"old", b"old", vec![]);
699 super::super::recovery::record(
700 &md,
701 &super::super::recovery::RecoveryEntry {
702 timestamp: 1,
703 op: "amend".into(),
704 superseded,
705 branch: "main".into(),
706 },
707 )
708 .unwrap();
709
710 run_gc(&s, &md, u64::MAX, 0, false).unwrap();
711 assert!(
712 s.contains(&superseded) && s.contains(&superseded_blob),
713 "a recovery-logged commit must not be pruned"
714 );
715 }
716
717 #[test]
718 fn collect_roots_includes_recovery_log_entries() {
719 let (d, s) = repo();
720 let md = mkit_dir(&d);
721 let (superseded, _) = commit_one(&s, b"old", b"old", vec![]);
722 super::super::recovery::record(
723 &md,
724 &super::super::recovery::RecoveryEntry {
725 timestamp: 1,
726 op: "amend".into(),
727 superseded,
728 branch: "main".into(),
729 },
730 )
731 .unwrap();
732 assert!(
733 collect_roots(&md).unwrap().contains(&superseded),
734 "a superseded commit in the recovery log must be a root"
735 );
736 }
737
738 #[test]
739 fn collect_roots_fails_closed_on_malformed_ref() {
740 let (d, _s) = repo();
741 let md = mkit_dir(&d);
742 let bad = md.join("refs/heads/corrupt");
745 fs::create_dir_all(bad.parent().unwrap()).unwrap();
746 fs::write(&bad, b"not-a-valid-object-id\n").unwrap();
747 assert!(
748 matches!(collect_roots(&md), Err(GcRootsError::BadHash(_))),
749 "malformed ref must fail closed"
750 );
751 }
752
753 #[test]
754 fn strict_walk_skips_lock_and_dotfile_cruft() {
755 let (d, s) = repo();
756 let md = mkit_dir(&d);
757 let (c, _) = commit_one(&s, b"m", b"m", vec![]);
758 write_ref(&md, "refs/heads/main", &c);
759 fs::write(md.join("refs/heads").join(".main.tmp.123.4"), b"garbage").unwrap();
762 let roots = collect_roots(&md).unwrap();
763 assert!(roots.contains(&c), "real ref still collected past cruft");
764 }
765}