use std::collections::{BTreeMap, HashSet};
use std::str;
use gix::Repository;
use gix::object::Kind;
use gix_hash::ObjectId;
use crate::git::{PeeledTip, Sha};
use super::PackchainError;
use super::schema::{PathIndex, PathNode, Sha40};
pub(crate) fn extract_path_index(
repo: &Repository,
peeled: &PeeledTip,
unpeeled_tip: Sha,
) -> Result<Option<PathIndex>, PackchainError> {
let tree_id = match peeled {
PeeledTip::Commit { commit, .. } => repo
.find_object(*commit.as_object_id())
.map_err(crate::git::GitError::from)?
.peel_to_kind(Kind::Commit)
.map_err(crate::git::GitError::from)?
.into_commit()
.tree_id()
.map_err(crate::git::GitError::from)?
.detach(),
PeeledTip::Tree { tree, .. } => *tree,
PeeledTip::Blob { .. } => return Ok(None),
};
let mut root: BTreeMap<String, PathNode> = BTreeMap::new();
let mut ancestors: HashSet<ObjectId> = HashSet::new();
walk_tree(repo, tree_id, &mut root, &mut ancestors)?;
Ok(Some(PathIndex {
v: PathIndex::SCHEMA_VERSION,
tip: Sha40::from_oid(unpeeled_tip.as_object_id())?,
tree: root,
}))
}
pub(crate) fn enumerate_tree_closure(
repo: &Repository,
tree: ObjectId,
) -> Result<Vec<ObjectId>, crate::git::GitError> {
use gix::objs::tree::EntryKind;
let mut oids = Vec::new();
let mut visited: HashSet<ObjectId> = HashSet::new();
let mut stack = vec![tree];
while let Some(current) = stack.pop() {
if !visited.insert(current) {
continue;
}
oids.push(current);
let object = repo.find_object(current)?.peel_to_kind(Kind::Tree)?;
for entry in object.into_tree().iter() {
let entry = entry?;
match entry.kind() {
EntryKind::Tree => stack.push(entry.oid().to_owned()),
EntryKind::Blob | EntryKind::BlobExecutable | EntryKind::Link => {
let oid = entry.oid().to_owned();
if visited.insert(oid) {
oids.push(oid);
}
}
EntryKind::Commit => {
}
}
}
}
Ok(oids)
}
fn walk_tree(
repo: &Repository,
tree_id: ObjectId,
out: &mut BTreeMap<String, PathNode>,
ancestors: &mut HashSet<ObjectId>,
) -> Result<(), PackchainError> {
use gix::objs::tree::EntryKind;
if !ancestors.insert(tree_id) {
return Err(PackchainError::TreeCycle {
oid: tree_id.to_string(),
});
}
let tree = repo
.find_object(tree_id)
.map_err(crate::git::GitError::from)?
.peel_to_kind(Kind::Tree)
.map_err(crate::git::GitError::from)?
.into_tree();
for entry in tree.iter() {
let entry = entry.map_err(crate::git::GitError::from)?;
let filename = entry.filename();
let name = str::from_utf8(filename).map_err(|_| PackchainError::InvalidPath {
bytes: filename.to_vec(),
})?;
match entry.kind() {
EntryKind::Tree => {
let mut subtree: BTreeMap<String, PathNode> = BTreeMap::new();
walk_tree(repo, entry.oid().to_owned(), &mut subtree, ancestors)?;
out.insert(name.to_owned(), PathNode::Tree(subtree));
}
EntryKind::Blob | EntryKind::BlobExecutable | EntryKind::Link => {
let sha = Sha40::from_oid(entry.oid())?;
out.insert(name.to_owned(), PathNode::Blob(sha));
}
EntryKind::Commit => {
}
}
}
ancestors.remove(&tree_id);
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use gix::actor::SignatureRef;
use gix::bstr::BStr;
use gix::objs::tree::{Entry, EntryKind};
use tempfile::TempDir;
fn signature() -> SignatureRef<'static> {
SignatureRef {
name: BStr::new("Tester"),
email: BStr::new("t@example.com"),
time: "0 +0000",
}
}
fn fixture_repo() -> (gix::Repository, TempDir, Sha) {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let cargo = repo.write_blob(b"cargo body").unwrap().detach();
let main_rs = repo.write_blob(b"fn main(){}").unwrap().detach();
let deep = repo.write_blob(b"// deep").unwrap().detach();
let inner_tree = repo
.write_object(&gix::objs::Tree {
entries: vec![Entry {
mode: EntryKind::Blob.into(),
filename: "deep.rs".into(),
oid: deep,
}],
})
.unwrap()
.detach();
let src_tree = repo
.write_object(&gix::objs::Tree {
entries: vec![
Entry {
mode: EntryKind::Tree.into(),
filename: "inner".into(),
oid: inner_tree,
},
Entry {
mode: EntryKind::Blob.into(),
filename: "main.rs".into(),
oid: main_rs,
},
],
})
.unwrap()
.detach();
let root_tree = repo
.write_object(&gix::objs::Tree {
entries: vec![
Entry {
mode: EntryKind::Blob.into(),
filename: "Cargo.toml".into(),
oid: cargo,
},
Entry {
mode: EntryKind::Tree.into(),
filename: "src".into(),
oid: src_tree,
},
],
})
.unwrap()
.detach();
let commit = repo
.commit_as(
signature(),
signature(),
"refs/heads/main",
"initial",
root_tree,
std::iter::empty::<ObjectId>(),
)
.unwrap()
.detach();
let tip = Sha::from_object_id(commit);
(repo, tmp, tip)
}
fn peeled_commit(repo: &gix::Repository, tip: Sha) -> PeeledTip {
crate::git::peel_tag_chain(repo, tip).expect("peel commit-tip")
}
#[test]
fn extract_path_index_reflects_nested_layout() {
let (repo, _guard, tip) = fixture_repo();
let peeled = peeled_commit(&repo, tip);
let index = extract_path_index(&repo, &peeled, tip)
.expect("extract")
.expect("commit-tip path-index must be present");
assert_eq!(index.v, PathIndex::SCHEMA_VERSION);
assert_eq!(index.tip.as_str(), tip.to_string());
assert_eq!(index.tree.len(), 2);
assert!(matches!(
index.tree.get("Cargo.toml"),
Some(PathNode::Blob(_))
));
let src = index.tree.get("src").expect("src present");
let PathNode::Tree(src_children) = src else {
panic!("expected src to be a Tree, got {src:?}");
};
assert_eq!(src_children.len(), 2);
assert!(matches!(
src_children.get("main.rs"),
Some(PathNode::Blob(_)),
));
let inner = src_children.get("inner").expect("inner present");
let PathNode::Tree(inner_children) = inner else {
panic!("expected inner to be a Tree, got {inner:?}");
};
assert_eq!(inner_children.len(), 1);
assert!(matches!(
inner_children.get("deep.rs"),
Some(PathNode::Blob(_)),
));
}
#[test]
fn extract_path_index_round_trips_via_json() {
let (repo, _guard, tip) = fixture_repo();
let peeled = peeled_commit(&repo, tip);
let index = extract_path_index(&repo, &peeled, tip).unwrap().unwrap();
let bytes = index.to_json_pretty().unwrap();
let decoded = PathIndex::from_json_bytes(&bytes).unwrap();
assert_eq!(decoded, index);
}
#[test]
fn extract_path_index_for_tree_tip_walks_tree_directly() {
let (repo, _guard, tip) = fixture_repo();
let root_tree = repo
.find_object(*tip.as_object_id())
.unwrap()
.peel_to_kind(Kind::Commit)
.unwrap()
.into_commit()
.tree_id()
.unwrap()
.detach();
let peeled = PeeledTip::Tree {
tree: root_tree,
tag_chain: Vec::new(),
};
let unpeeled = Sha::from_object_id(root_tree);
let index = extract_path_index(&repo, &peeled, unpeeled)
.unwrap()
.expect("tree-tip path-index must be present");
assert_eq!(index.tip.as_str(), unpeeled.to_string());
assert!(index.tree.contains_key("Cargo.toml"));
assert!(index.tree.contains_key("src"));
}
#[test]
fn extract_path_index_for_blob_tip_returns_none() {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let blob = repo.write_blob(b"data").unwrap().detach();
let peeled = PeeledTip::Blob {
blob,
tag_chain: Vec::new(),
};
let result = extract_path_index(&repo, &peeled, Sha::from_object_id(blob)).unwrap();
assert!(result.is_none(), "blob-tipped chains have no tree to index");
}
#[test]
fn extract_path_index_rejects_non_utf8_filename() {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let blob = repo.write_blob(b"x").unwrap().detach();
let filename = gix::bstr::BString::from(vec![b'a', 0x80, b'b']);
let tree = repo
.write_object(&gix::objs::Tree {
entries: vec![Entry {
mode: EntryKind::Blob.into(),
filename,
oid: blob,
}],
})
.unwrap()
.detach();
let commit = repo
.commit_as(
signature(),
signature(),
"refs/heads/bad",
"non-utf8 filename",
tree,
std::iter::empty::<ObjectId>(),
)
.unwrap()
.detach();
let tip = Sha::from_object_id(commit);
let peeled = peeled_commit(&repo, tip);
let err =
extract_path_index(&repo, &peeled, tip).expect_err("non-UTF-8 filename must reject");
assert!(
matches!(err, PackchainError::InvalidPath { ref bytes } if bytes == &[b'a', 0x80, b'b']),
"expected InvalidPath with offending bytes, got {err:?}",
);
let msg = err.to_string();
assert!(
msg.starts_with("invalid path: a"),
"expected lossy-UTF-8 diagnostic, got {msg}",
);
}
#[test]
fn extract_path_index_keeps_paths_for_existing_repo() {
let (_repo_inmem, guard, tip) = fixture_repo();
let opened = gix::open(guard.path()).expect("re-open");
let peeled = peeled_commit(&opened, tip);
let index = extract_path_index(&opened, &peeled, tip)
.expect("extract on opened")
.expect("commit-tip path-index must be present");
assert!(index.tree.contains_key("Cargo.toml"));
assert!(index.tree.contains_key("src"));
}
#[test]
fn enumerate_tree_closure_yields_tree_subtree_and_blob_oids() {
let (repo, _guard, tip) = fixture_repo();
let root_tree = repo
.find_object(*tip.as_object_id())
.unwrap()
.peel_to_kind(Kind::Commit)
.unwrap()
.into_commit()
.tree_id()
.unwrap()
.detach();
let oids = enumerate_tree_closure(&repo, root_tree).unwrap();
assert_eq!(oids.len(), 6, "fixture has 3 trees + 3 blobs, got {oids:?}");
assert!(oids.contains(&root_tree), "root tree must be included");
}
#[test]
fn enumerate_tree_closure_handles_empty_tree() {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let empty_tree = repo
.write_object(&gix::objs::Tree {
entries: Vec::new(),
})
.unwrap()
.detach();
let oids = enumerate_tree_closure(&repo, empty_tree).unwrap();
assert_eq!(oids, vec![empty_tree]);
}
#[test]
fn enumerate_tree_closure_skips_gitlink_entries() {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let gitlink_oid = ObjectId::from_hex(b"0123456789abcdef0123456789abcdef01234567").unwrap();
let blob = repo.write_blob(b"x").unwrap().detach();
let tree = repo
.write_object(&gix::objs::Tree {
entries: vec![
Entry {
mode: EntryKind::Commit.into(),
filename: "submod".into(),
oid: gitlink_oid,
},
Entry {
mode: EntryKind::Blob.into(),
filename: "x".into(),
oid: blob,
},
],
})
.unwrap()
.detach();
let oids = enumerate_tree_closure(&repo, tree).unwrap();
assert!(oids.contains(&tree), "tree itself must be included");
assert!(oids.contains(&blob), "blob entry must be included");
assert!(
!oids.contains(&gitlink_oid),
"gitlink OID must be skipped (lives in another repo)",
);
}
#[test]
fn enumerate_tree_closure_dedupes_shared_blob() {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let blob = repo.write_blob(b"shared").unwrap().detach();
let tree = repo
.write_object(&gix::objs::Tree {
entries: vec![
Entry {
mode: EntryKind::Blob.into(),
filename: "a".into(),
oid: blob,
},
Entry {
mode: EntryKind::Blob.into(),
filename: "b".into(),
oid: blob,
},
],
})
.unwrap()
.detach();
let oids = enumerate_tree_closure(&repo, tree).unwrap();
assert_eq!(
oids.len(),
2,
"shared blob must be emitted exactly once (got {oids:?})",
);
assert!(oids.contains(&tree));
assert!(oids.contains(&blob));
}
#[test]
fn enumerate_tree_closure_dedupes_shared_subtree() {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let blob = repo.write_blob(b"leaf").unwrap().detach();
let subtree = repo
.write_object(&gix::objs::Tree {
entries: vec![Entry {
mode: EntryKind::Blob.into(),
filename: "leaf.txt".into(),
oid: blob,
}],
})
.unwrap()
.detach();
let root = repo
.write_object(&gix::objs::Tree {
entries: vec![
Entry {
mode: EntryKind::Tree.into(),
filename: "left".into(),
oid: subtree,
},
Entry {
mode: EntryKind::Tree.into(),
filename: "right".into(),
oid: subtree,
},
],
})
.unwrap()
.detach();
let oids = enumerate_tree_closure(&repo, root).unwrap();
assert_eq!(
oids.len(),
3,
"root + shared subtree (once) + shared blob (once); got {oids:?}",
);
assert!(oids.contains(&root));
assert!(oids.contains(&subtree));
assert!(oids.contains(&blob));
}
#[test]
fn enumerate_tree_closure_includes_symlink_blobs() {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let target = repo.write_blob(b"target/path").unwrap().detach();
let tree = repo
.write_object(&gix::objs::Tree {
entries: vec![Entry {
mode: EntryKind::Link.into(),
filename: "alias".into(),
oid: target,
}],
})
.unwrap()
.detach();
let oids = enumerate_tree_closure(&repo, tree).unwrap();
assert!(oids.contains(&tree));
assert!(
oids.contains(&target),
"symlink target blob must be included"
);
}
fn write_corrupt_loose_tree(
repo_path: &std::path::Path,
oid: ObjectId,
entries: &[(EntryKind, &str, ObjectId)],
) {
use flate2::Compression;
use flate2::write::ZlibEncoder;
use std::io::Write as _;
let mut body: Vec<u8> = Vec::new();
for (kind, name, entry_oid) in entries {
let mode: u32 = match kind {
EntryKind::Tree => 0o040_000,
EntryKind::Blob => 0o100_644,
EntryKind::BlobExecutable => 0o100_755,
EntryKind::Link => 0o120_000,
EntryKind::Commit => 0o160_000,
};
body.extend_from_slice(format!("{mode:o}").as_bytes());
body.push(b' ');
body.extend_from_slice(name.as_bytes());
body.push(0);
body.extend_from_slice(entry_oid.as_slice());
}
let mut full: Vec<u8> = Vec::new();
full.extend_from_slice(format!("tree {}", body.len()).as_bytes());
full.push(0);
full.extend_from_slice(&body);
let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
encoder.write_all(&full).unwrap();
let compressed = encoder.finish().unwrap();
let hex = oid.to_string();
let dir = repo_path.join(".git/objects").join(&hex[..2]);
std::fs::create_dir_all(&dir).unwrap();
std::fs::write(dir.join(&hex[2..]), compressed).unwrap();
}
#[test]
fn extract_path_index_detects_direct_self_cycle() {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let cyclic = ObjectId::from_hex(b"1111111111111111111111111111111111111111").unwrap();
write_corrupt_loose_tree(tmp.path(), cyclic, &[(EntryKind::Tree, "self", cyclic)]);
let peeled = PeeledTip::Tree {
tree: cyclic,
tag_chain: Vec::new(),
};
let unpeeled = Sha::from_object_id(cyclic);
let err = extract_path_index(&repo, &peeled, unpeeled)
.expect_err("self-referential tree must be rejected as a cycle");
match err {
PackchainError::TreeCycle { oid } => {
assert_eq!(oid, cyclic.to_string());
}
other => panic!("expected TreeCycle, got {other:?}"),
}
}
#[test]
fn extract_path_index_detects_indirect_cycle() {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let t1 = ObjectId::from_hex(b"2222222222222222222222222222222222222222").unwrap();
let t2 = ObjectId::from_hex(b"3333333333333333333333333333333333333333").unwrap();
write_corrupt_loose_tree(tmp.path(), t1, &[(EntryKind::Tree, "down", t2)]);
write_corrupt_loose_tree(tmp.path(), t2, &[(EntryKind::Tree, "back", t1)]);
let peeled = PeeledTip::Tree {
tree: t1,
tag_chain: Vec::new(),
};
let unpeeled = Sha::from_object_id(t1);
let err = extract_path_index(&repo, &peeled, unpeeled)
.expect_err("indirect tree cycle must be rejected");
match err {
PackchainError::TreeCycle { oid } => {
assert_eq!(oid, t1.to_string());
}
other => panic!("expected TreeCycle, got {other:?}"),
}
}
#[test]
fn extract_path_index_walks_shared_subtree_at_distinct_paths() {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let leaf = repo.write_blob(b"hello").unwrap().detach();
let shared = repo
.write_object(&gix::objs::Tree {
entries: vec![Entry {
mode: EntryKind::Blob.into(),
filename: "leaf.txt".into(),
oid: leaf,
}],
})
.unwrap()
.detach();
let root = repo
.write_object(&gix::objs::Tree {
entries: vec![
Entry {
mode: EntryKind::Tree.into(),
filename: "src".into(),
oid: shared,
},
Entry {
mode: EntryKind::Tree.into(),
filename: "vendor".into(),
oid: shared,
},
],
})
.unwrap()
.detach();
let peeled = PeeledTip::Tree {
tree: root,
tag_chain: Vec::new(),
};
let unpeeled = Sha::from_object_id(root);
let index = extract_path_index(&repo, &peeled, unpeeled)
.expect("shared-subtree walk must succeed")
.expect("tree-tip path-index must be present");
let src = index.tree.get("src").expect("src present");
let vendor = index.tree.get("vendor").expect("vendor present");
let PathNode::Tree(src_children) = src else {
panic!("src must be a Tree, got {src:?}");
};
let PathNode::Tree(vendor_children) = vendor else {
panic!("vendor must be a Tree, got {vendor:?}");
};
assert_eq!(
src_children, vendor_children,
"shared subtree must yield identical child maps at both paths",
);
assert!(matches!(
src_children.get("leaf.txt"),
Some(PathNode::Blob(_)),
));
}
fn synthetic_wide_tree(dirs: usize, files_per_dir: usize) -> (gix::Repository, TempDir, Sha) {
let tmp = TempDir::new().unwrap();
let repo = gix::init(tmp.path()).unwrap();
let blob = repo.write_blob(b"x").unwrap().detach();
let mut root_entries = Vec::with_capacity(dirs);
for d in 0..dirs {
let mut dir_entries = Vec::with_capacity(files_per_dir);
for f in 0..files_per_dir {
dir_entries.push(Entry {
mode: EntryKind::Blob.into(),
filename: format!("file-{f:05}.bin").into(),
oid: blob,
});
}
let dir_tree = repo
.write_object(&gix::objs::Tree {
entries: dir_entries,
})
.unwrap()
.detach();
root_entries.push(Entry {
mode: EntryKind::Tree.into(),
filename: format!("dir-{d:05}").into(),
oid: dir_tree,
});
}
let root = repo
.write_object(&gix::objs::Tree {
entries: root_entries,
})
.unwrap()
.detach();
(repo, tmp, Sha::from_object_id(root))
}
#[test]
#[ignore = "perf reproducer for issue #96 — run with --ignored"]
fn walk_tree_synthetic_wide_tree_perf_reproducer() {
use std::time::Instant;
let dirs = 5_000;
let files_per_dir = 16;
let (repo, _guard, root) = synthetic_wide_tree(dirs, files_per_dir);
let peeled = PeeledTip::Tree {
tree: *root.as_object_id(),
tag_chain: Vec::new(),
};
let iterations = 5;
let mut total = std::time::Duration::ZERO;
for _ in 0..iterations {
let start = Instant::now();
let index = extract_path_index(&repo, &peeled, root)
.expect("perf reproducer must succeed")
.expect("tree-tip path-index must be present");
total += start.elapsed();
assert_eq!(index.tree.len(), dirs);
}
let avg = total / iterations;
eprintln!(
"extract_path_index over {dirs} dirs x {files_per_dir} blobs/dir \
({} entries total): avg {:.2?} per call, {iterations} iterations",
dirs * files_per_dir + dirs,
avg,
);
assert!(
avg < std::time::Duration::from_secs(2),
"extract_path_index avg {avg:.2?} exceeds 2 s upper bound — \
suspect a quadratic regression in walk_tree",
);
}
}