use crate::error::Result;
use crate::hash::Hash;
use crate::object::ObjectType;
use crate::store::Store;
use serde::Serialize;
use std::collections::HashSet;
use std::fs;
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct GcStats {
pub objects_deleted: usize,
pub bytes_freed: u64,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct OrphanRoot {
pub hash: Hash,
pub object_type: ObjectType,
pub entry_count: Option<usize>,
pub approx_size: u64,
}
type EntryInfo = (Hash, ObjectType, Option<usize>, u64);
type GcResult = (Vec<EntryInfo>, HashSet<Hash>);
impl Store {
pub fn gc(&self, dry_run: bool) -> Result<GcStats> {
let reachable = self.mark_reachable()?;
self.sweep(&reachable, dry_run)
}
pub(crate) fn mark_reachable(&self) -> Result<HashSet<Hash>> {
let mut reachable = HashSet::new();
let refs = self.refs().list()?;
for (_name, hash) in refs {
self.mark_object(&hash, &mut reachable)?;
}
Ok(reachable)
}
fn mark_object(&self, hash: &Hash, reachable: &mut HashSet<Hash>) -> Result<()> {
if reachable.contains(hash) {
return Ok(());
}
let obj_path = self.object_path(hash);
if !obj_path.exists() {
return Ok(());
}
reachable.insert(*hash);
let header = self.read_object_header(&obj_path)?;
match header.object_type {
ObjectType::Tree => {
let tree = self.get_tree(hash)?;
for entry in tree {
self.mark_object(&entry.hash, reachable)?;
}
}
ObjectType::ChunkList => {
use crate::object::ChunkList;
let chunk_list_payload = self.read_object_payload(&obj_path, header.payload_len)?;
let chunk_list = ChunkList::decode(&chunk_list_payload)?;
for chunk_entry in &chunk_list.chunks {
self.mark_object(&chunk_entry.hash, reachable)?;
}
}
ObjectType::Blob => {
}
}
Ok(())
}
fn sweep(&self, reachable: &HashSet<Hash>, dry_run: bool) -> Result<GcStats> {
let mut stats = GcStats {
objects_deleted: 0,
bytes_freed: 0,
};
let objects_dir = self.root().join("objects").join(self.algorithm().as_str());
if !objects_dir.exists() {
return Ok(stats);
}
for shard_entry in fs::read_dir(&objects_dir)? {
let shard_entry = shard_entry?;
let shard_path = shard_entry.path();
if !shard_path.is_dir() {
continue;
}
for obj_entry in fs::read_dir(&shard_path)? {
let obj_entry = obj_entry?;
let obj_path = obj_entry.path();
if !obj_path.is_file() {
continue;
}
let prefix = shard_path
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("");
let suffix = obj_path.file_name().and_then(|n| n.to_str()).unwrap_or("");
let hash_str = format!("{}{}", prefix, suffix);
if let Ok(hash) = Hash::from_hex(&hash_str) {
if !reachable.contains(&hash) {
let metadata = fs::metadata(&obj_path)?;
stats.bytes_freed += metadata.len();
stats.objects_deleted += 1;
if !dry_run {
fs::remove_file(&obj_path)?;
}
}
}
}
if !dry_run
&& let Ok(mut entries) = fs::read_dir(&shard_path)
&& entries.next().is_none()
{
let _ = fs::remove_dir(&shard_path);
}
}
Ok(stats)
}
pub fn find_orphan_roots(&self) -> Result<Vec<OrphanRoot>> {
let reachable = self.mark_reachable()?;
let (unreachable_objects, child_refs) = self.scan_unreachable(&reachable)?;
self.filter_orphan_roots(unreachable_objects, &child_refs)
}
fn scan_unreachable(&self, reachable: &HashSet<Hash>) -> Result<GcResult> {
let mut unreachable_objects = Vec::new();
let mut child_refs = HashSet::new();
let objects_dir = self.root().join("objects").join(self.algorithm().as_str());
if !objects_dir.exists() {
return Ok((unreachable_objects, child_refs));
}
for shard_entry in fs::read_dir(&objects_dir)? {
let shard_entry = shard_entry?;
let shard_path = shard_entry.path();
if !shard_path.is_dir() {
continue;
}
for obj_entry in fs::read_dir(&shard_path)? {
let obj_entry = obj_entry?;
let obj_path = obj_entry.path();
if !obj_path.is_file() {
continue;
}
let prefix = shard_path
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("");
let suffix = obj_path.file_name().and_then(|n| n.to_str()).unwrap_or("");
let hash_str = format!("{}{}", prefix, suffix);
if let Ok(hash) = Hash::from_hex(&hash_str) {
if reachable.contains(&hash) {
continue;
}
if let Ok(header) = self.read_object_header(&obj_path) {
let size = fs::metadata(&obj_path)?.len();
match header.object_type {
ObjectType::Tree => {
if let Ok(entries) = self.get_tree(&hash) {
let entry_count = entries.len();
for entry in &entries {
child_refs.insert(entry.hash);
}
unreachable_objects.push((
hash,
ObjectType::Tree,
Some(entry_count),
size,
));
}
}
ObjectType::Blob => {
unreachable_objects.push((hash, ObjectType::Blob, None, size));
}
ObjectType::ChunkList => {
}
}
}
}
}
}
Ok((unreachable_objects, child_refs))
}
fn filter_orphan_roots(
&self,
unreachable_objects: Vec<(Hash, ObjectType, Option<usize>, u64)>,
child_refs: &HashSet<Hash>,
) -> Result<Vec<OrphanRoot>> {
let mut orphan_roots = Vec::new();
for (hash, object_type, entry_count, approx_size) in unreachable_objects {
if !child_refs.contains(&hash) {
orphan_roots.push(OrphanRoot {
hash,
object_type,
entry_count,
approx_size,
});
}
}
Ok(orphan_roots)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::hash::Algorithm;
use tempfile::TempDir;
#[test]
fn test_gc_empty_store() {
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let stats = store.gc(false).unwrap();
assert_eq!(stats.objects_deleted, 0);
assert_eq!(stats.bytes_freed, 0);
}
#[test]
fn test_gc_with_ref() {
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let hash = store.put_blob(b"test data".as_ref()).unwrap();
store.refs().add("myref", &hash).unwrap();
let stats = store.gc(false).unwrap();
assert_eq!(stats.objects_deleted, 0);
}
#[test]
fn test_gc_unreferenced_blob() {
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let hash = store.put_blob(b"orphan data".as_ref()).unwrap();
assert!(store.object_path(&hash).exists());
let stats = store.gc(false).unwrap();
assert_eq!(stats.objects_deleted, 1);
assert!(stats.bytes_freed > 0);
assert!(!store.object_path(&hash).exists());
}
#[test]
fn test_gc_dry_run() {
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let hash = store.put_blob(b"orphan".as_ref()).unwrap();
let stats = store.gc(true).unwrap();
assert_eq!(stats.objects_deleted, 1);
assert!(stats.bytes_freed > 0);
assert!(store.object_path(&hash).exists());
let stats2 = store.gc(false).unwrap();
assert_eq!(stats2.objects_deleted, 1);
assert!(!store.object_path(&hash).exists());
}
#[test]
fn test_gc_tree_reachability() {
use crate::tree::{EntryType, TreeEntry, file_modes};
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let blob1 = store.put_blob(b"file1".as_ref()).unwrap();
let blob2 = store.put_blob(b"file2".as_ref()).unwrap();
let orphan = store.put_blob(b"orphan".as_ref()).unwrap();
let entries = vec![
TreeEntry::new(
EntryType::Blob,
file_modes::REGULAR,
blob1,
"file1".to_string(),
)
.unwrap(),
TreeEntry::new(
EntryType::Blob,
file_modes::REGULAR,
blob2,
"file2".to_string(),
)
.unwrap(),
];
let tree = store.put_tree(entries).unwrap();
store.refs().add("mytree", &tree).unwrap();
let stats = store.gc(false).unwrap();
assert_eq!(stats.objects_deleted, 1);
assert!(store.object_path(&tree).exists());
assert!(store.object_path(&blob1).exists());
assert!(store.object_path(&blob2).exists());
assert!(!store.object_path(&orphan).exists());
}
#[test]
fn test_gc_after_ref_removed() {
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let hash = store.put_blob(b"data".as_ref()).unwrap();
store.refs().add("ref1", &hash).unwrap();
let stats = store.gc(false).unwrap();
assert_eq!(stats.objects_deleted, 0);
store.refs().remove("ref1").unwrap();
let stats = store.gc(false).unwrap();
assert_eq!(stats.objects_deleted, 1);
assert!(!store.object_path(&hash).exists());
}
#[test]
fn test_find_orphans_empty_store() {
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let orphans = store.find_orphan_roots().unwrap();
assert_eq!(orphans.len(), 0);
}
#[test]
fn test_find_orphans_unreferenced_tree() {
use crate::tree::{EntryType, TreeEntry, file_modes};
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let blob = store.put_blob(b"file content".as_ref()).unwrap();
let entries = vec![
TreeEntry::new(
EntryType::Blob,
file_modes::REGULAR,
blob,
"file.txt".to_string(),
)
.unwrap(),
];
let tree_hash = store.put_tree(entries).unwrap();
let orphans = store.find_orphan_roots().unwrap();
assert_eq!(orphans.len(), 1);
assert_eq!(orphans[0].hash, tree_hash);
assert_eq!(orphans[0].object_type, ObjectType::Tree);
assert_eq!(orphans[0].entry_count, Some(1));
assert!(orphans[0].approx_size > 0);
}
#[test]
fn test_find_orphans_referenced_tree() {
use crate::tree::{EntryType, TreeEntry, file_modes};
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let blob = store.put_blob(b"file content".as_ref()).unwrap();
let entries = vec![
TreeEntry::new(
EntryType::Blob,
file_modes::REGULAR,
blob,
"file.txt".to_string(),
)
.unwrap(),
];
let tree_hash = store.put_tree(entries).unwrap();
store.refs().add("mytree", &tree_hash).unwrap();
let orphans = store.find_orphan_roots().unwrap();
assert_eq!(orphans.len(), 0);
}
#[test]
fn test_find_orphans_nested_unreferenced_trees() {
use crate::tree::{EntryType, TreeEntry, file_modes};
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let blob = store.put_blob(b"nested file".as_ref()).unwrap();
let subtree_entries = vec![
TreeEntry::new(
EntryType::Blob,
file_modes::REGULAR,
blob,
"nested.txt".to_string(),
)
.unwrap(),
];
let subtree_hash = store.put_tree(subtree_entries).unwrap();
let parent_entries = vec![
TreeEntry::new(
EntryType::Tree,
file_modes::DIRECTORY,
subtree_hash,
"subdir".to_string(),
)
.unwrap(),
];
let parent_hash = store.put_tree(parent_entries).unwrap();
let orphans = store.find_orphan_roots().unwrap();
assert_eq!(orphans.len(), 1);
assert_eq!(orphans[0].hash, parent_hash);
assert_eq!(orphans[0].object_type, ObjectType::Tree);
}
#[test]
fn test_find_orphans_includes_blobs() {
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let hash1 = store.put_blob(b"orphan1".as_ref()).unwrap();
let hash2 = store.put_blob(b"orphan2".as_ref()).unwrap();
let orphans = store.find_orphan_roots().unwrap();
assert_eq!(orphans.len(), 2);
assert!(
orphans
.iter()
.any(|o| o.hash == hash1 && o.object_type == ObjectType::Blob)
);
assert!(
orphans
.iter()
.any(|o| o.hash == hash2 && o.object_type == ObjectType::Blob)
);
assert!(orphans.iter().all(|o| o.entry_count.is_none()));
}
#[test]
fn test_find_orphans_mixed_blobs_and_trees() {
use crate::tree::{EntryType, TreeEntry, file_modes};
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let blob1 = store.put_blob(b"orphan blob 1".as_ref()).unwrap();
let blob2 = store.put_blob(b"orphan blob 2".as_ref()).unwrap();
let tree_blob = store.put_blob(b"tree content".as_ref()).unwrap();
let tree_hash = store
.put_tree(vec![
TreeEntry::new(
EntryType::Blob,
file_modes::REGULAR,
tree_blob,
"file.txt".to_string(),
)
.unwrap(),
])
.unwrap();
let orphans = store.find_orphan_roots().unwrap();
assert_eq!(orphans.len(), 3);
assert!(
orphans
.iter()
.any(|o| o.hash == blob1 && o.object_type == ObjectType::Blob)
);
assert!(
orphans
.iter()
.any(|o| o.hash == blob2 && o.object_type == ObjectType::Blob)
);
assert!(
orphans
.iter()
.any(|o| o.hash == tree_hash && o.object_type == ObjectType::Tree)
);
let blob_orphans: Vec<_> = orphans
.iter()
.filter(|o| o.object_type == ObjectType::Blob)
.collect();
assert_eq!(blob_orphans.len(), 2);
assert!(blob_orphans.iter().all(|o| o.entry_count.is_none()));
let tree_orphans: Vec<_> = orphans
.iter()
.filter(|o| o.object_type == ObjectType::Tree)
.collect();
assert_eq!(tree_orphans.len(), 1);
assert_eq!(tree_orphans[0].entry_count, Some(1));
}
#[test]
fn test_find_orphans_multiple_orphan_roots() {
use crate::tree::{EntryType, TreeEntry, file_modes};
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3).unwrap();
let blob1 = store.put_blob(b"file1".as_ref()).unwrap();
let tree1 = store
.put_tree(vec![
TreeEntry::new(
EntryType::Blob,
file_modes::REGULAR,
blob1,
"file1.txt".to_string(),
)
.unwrap(),
])
.unwrap();
let blob2 = store.put_blob(b"file2".as_ref()).unwrap();
let tree2 = store
.put_tree(vec![
TreeEntry::new(
EntryType::Blob,
file_modes::REGULAR,
blob2,
"file2.txt".to_string(),
)
.unwrap(),
])
.unwrap();
let orphans = store.find_orphan_roots().unwrap();
assert_eq!(orphans.len(), 2);
let orphan_hashes: Vec<_> = orphans.iter().map(|o| o.hash).collect();
assert!(orphan_hashes.contains(&tree1));
assert!(orphan_hashes.contains(&tree2));
}
use proptest::prelude::*;
proptest! {
#![proptest_config(ProptestConfig {
cases: 32, // Expensive tests - reduced case count
max_shrink_iters: 5000,
..ProptestConfig::default()
})]
#[test]
fn prop_gc_preserves_referenced(_seed in any::<u64>()) {
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3)?;
let ref_data = b"referenced content";
let referenced_hash = store.put_blob(&ref_data[..])?;
store.refs().add("test-ref", &referenced_hash)?;
let unref_data = b"unreferenced content";
let _unreferenced_hash = store.put_blob(&unref_data[..])?;
let stats = store.gc(false)?;
prop_assert!(
store.get_blob(&referenced_hash).is_ok(),
"GC deleted a referenced object"
);
prop_assert!(
stats.objects_deleted > 0,
"GC should delete unreferenced objects"
);
}
#[test]
fn prop_gc_deletes_unreferenced(_seed in any::<u64>()) {
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3)?;
let ref_data = b"referenced";
let referenced_hash = store.put_blob(&ref_data[..])?;
store.refs().add("keep", &referenced_hash)?;
let unref_data = b"unreferenced";
let unreferenced_hash = store.put_blob(&unref_data[..])?;
store.gc(false)?;
prop_assert!(
store.get_blob(&unreferenced_hash).is_err(),
"GC failed to delete unreferenced object"
);
prop_assert!(
store.get_blob(&referenced_hash).is_ok(),
"GC deleted referenced object"
);
}
#[test]
fn prop_gc_idempotent(_seed in any::<u64>()) {
let temp_dir = TempDir::new().unwrap();
let store = Store::init(temp_dir.path(), Algorithm::Blake3)?;
let ref_hash = store.put_blob(b"referenced".as_ref())?;
store.refs().add("keep", &ref_hash)?;
store.put_blob(b"unreferenced".as_ref())?;
let stats1 = store.gc(false)?;
let stats2 = store.gc(false)?;
prop_assert_eq!(
stats2.objects_deleted,
0,
"GC is not idempotent - deleted objects on second run"
);
prop_assert!(
stats1.objects_deleted > 0,
"First GC run should delete unreferenced objects"
);
}
}
}