use std::collections::HashMap;
use std::io::Read;
use flate2::read::ZlibDecoder;
use sha1::{Digest, Sha1};
use crate::error::{Error, Result};
use crate::objects::{ObjectId, ObjectKind};
use crate::odb::Odb;
/// Options accepted by [`unpack_objects`].
#[derive(Debug, Default)]
pub struct UnpackOptions {
/// When `true`, objects are only hashed and never written to the object database;
/// delta bases are then also not looked up in the object database.
pub dry_run: bool,
/// NOTE(review): not read anywhere in this file — presumably suppresses progress
/// output in callers; confirm before relying on it.
pub quiet: bool,
}
/// A delta entry read from the pack whose base has not been resolved yet.
///
/// Exactly one of `base_oid` / `base_offset` is populated by `unpack_objects`.
struct PendingDelta {
// Offset of this entry's header within the pack stream.
offset: usize,
// Base identified by object id (packed-object type 7); `None` for type 6 entries.
base_oid: Option<ObjectId>,
// Absolute pack offset of the base (packed-object type 6); `None` for type 7 entries.
base_offset: Option<usize>,
// Decompressed delta instruction stream, as consumed by `apply_delta`.
delta_data: Vec<u8>,
}
/// Reads a complete pack stream from `reader` and unpacks every object it contains.
///
/// The whole stream is buffered in memory. The `PACK` signature, the version
/// (2 or 3) and the trailing SHA-1 checksum are validated. Non-delta objects are
/// handed to `write_or_hash` as they are parsed (i.e. before the checksum is
/// verified); delta entries are collected and resolved afterwards, in as many
/// passes as needed so that chains of deltas resolve regardless of their order.
///
/// With `opts.dry_run` set, objects are only hashed and the object database is
/// never consulted for delta bases. `opts.quiet` is not used by this function.
///
/// Returns the number of objects unpacked (regular objects plus resolved deltas).
///
/// # Errors
///
/// * `Error::Io` if reading from `reader` fails.
/// * `Error::CorruptObject` for a bad signature, unsupported version, unknown
///   entry type, truncated stream, checksum mismatch, or deltas whose base
///   cannot be found.
/// * Any error produced by decompression, delta application or the object database.
pub fn unpack_objects(reader: &mut dyn Read, odb: &Odb, opts: &UnpackOptions) -> Result<usize> {
    let mut raw = Vec::new();
    reader.read_to_end(&mut raw).map_err(Error::Io)?;
    let mut rd = PackReader::new(raw);

    if rd.read_exact(4)? != b"PACK" {
        return Err(Error::CorruptObject(
            "not a pack stream: invalid signature".to_owned(),
        ));
    }
    let version = rd.read_u32_be()?;
    if !matches!(version, 2 | 3) {
        return Err(Error::CorruptObject(format!(
            "unsupported pack version {version}"
        )));
    }
    let nr_objects = rd.read_u32_be()? as usize;

    // `by_offset` is the single owner of every unpacked payload; `offset_by_oid`
    // only indexes into it, so each object's bytes are held in memory once.
    let mut by_offset: HashMap<usize, (ObjectKind, Vec<u8>)> = HashMap::new();
    let mut offset_by_oid: HashMap<ObjectId, usize> = HashMap::new();
    let mut pending: Vec<PendingDelta> = Vec::new();
    let mut count = 0usize;

    for _ in 0..nr_objects {
        let obj_offset = rd.pos;
        let (type_code, size) = rd.read_type_size()?;
        match type_code {
            // Regular object: commit, tree, blob or tag.
            1..=4 => {
                let kind = type_code_to_kind(type_code)?;
                let data = rd.decompress(size)?;
                let oid = write_or_hash(kind, &data, odb, opts.dry_run)?;
                by_offset.insert(obj_offset, (kind, data));
                offset_by_oid.insert(oid, obj_offset);
                count += 1;
            }
            // Offset delta: the base lives `neg` bytes before this entry.
            6 => {
                let neg = rd.read_ofs_neg_offset()?;
                let base_offset = obj_offset.checked_sub(neg).ok_or_else(|| {
                    Error::CorruptObject("ofs-delta base offset underflow".to_owned())
                })?;
                let delta_data = rd.decompress(size)?;
                pending.push(PendingDelta {
                    offset: obj_offset,
                    base_oid: None,
                    base_offset: Some(base_offset),
                    delta_data,
                });
            }
            // Reference delta: the base is named by a 20-byte object id.
            7 => {
                let base_oid = ObjectId::from_bytes(rd.read_exact(20)?)?;
                let delta_data = rd.decompress(size)?;
                pending.push(PendingDelta {
                    offset: obj_offset,
                    base_oid: Some(base_oid),
                    base_offset: None,
                    delta_data,
                });
            }
            other => {
                return Err(Error::CorruptObject(format!(
                    "unknown packed-object type {other}"
                )))
            }
        }
    }

    // The trailer is the SHA-1 of everything that precedes it.
    let consumed = rd.pos;
    let mut hasher = Sha1::new();
    hasher.update(&rd.data[..consumed]);
    let digest = hasher.finalize();
    if digest.as_slice() != rd.read_exact(20)? {
        return Err(Error::CorruptObject(
            "pack trailing checksum mismatch".to_owned(),
        ));
    }

    // Resolve deltas in passes: each pass resolves every delta whose base is
    // already known. A pass that makes no progress means the rest can never resolve.
    while !pending.is_empty() {
        let before = pending.len();
        let mut unresolved = Vec::new();
        for delta in pending {
            // Keeps a base fetched from the object database alive while it is borrowed.
            let odb_base;
            let base: Option<(ObjectKind, &[u8])> = if let Some(base_off) = delta.base_offset {
                by_offset
                    .get(&base_off)
                    .map(|(kind, data)| (*kind, data.as_slice()))
            } else if let Some(base_id) = delta.base_oid.as_ref() {
                if let Some(base_off) = offset_by_oid.get(base_id) {
                    by_offset
                        .get(base_off)
                        .map(|(kind, data)| (*kind, data.as_slice()))
                } else if !opts.dry_run {
                    // A failed read is treated as "not available yet": the base may
                    // still be produced by another delta in a later pass.
                    odb_base = odb.read(base_id).ok();
                    odb_base
                        .as_ref()
                        .map(|obj| (obj.kind, obj.data.as_slice()))
                } else {
                    None
                }
            } else {
                None
            };

            // Apply the delta against the borrowed base; the borrow of `by_offset`
            // ends here, before the map is mutated below.
            let resolved = match base {
                Some((kind, bytes)) => Some((kind, apply_delta(bytes, &delta.delta_data)?)),
                None => None,
            };

            match resolved {
                Some((kind, result)) => {
                    let oid = write_or_hash(kind, &result, odb, opts.dry_run)?;
                    by_offset.insert(delta.offset, (kind, result));
                    offset_by_oid.insert(oid, delta.offset);
                    count += 1;
                }
                None => unresolved.push(delta),
            }
        }
        pending = unresolved;
        if pending.len() == before {
            return Err(Error::CorruptObject(format!(
                "{} delta(s) could not be resolved",
                pending.len()
            )));
        }
    }
    Ok(count)
}
/// Stores `data` as an object of type `kind` and returns its id.
///
/// In dry-run mode nothing is written: the id is only computed via
/// `Odb::hash_object_data`.
fn write_or_hash(kind: ObjectKind, data: &[u8], odb: &Odb, dry_run: bool) -> Result<ObjectId> {
    if !dry_run {
        return odb.write(kind, data);
    }
    Ok(Odb::hash_object_data(kind, data))
}
fn type_code_to_kind(code: u8) -> Result<ObjectKind> {
match code {
1 => Ok(ObjectKind::Commit),
2 => Ok(ObjectKind::Tree),
3 => Ok(ObjectKind::Blob),
4 => Ok(ObjectKind::Tag),
_ => Err(Error::CorruptObject(format!(
"type code {code} is not a regular object type"
))),
}
}
struct PackReader {
data: Vec<u8>,
pos: usize,
}
impl PackReader {
fn new(data: Vec<u8>) -> Self {
Self { data, pos: 0 }
}
fn read_exact(&mut self, n: usize) -> Result<&[u8]> {
if self.pos + n > self.data.len() {
return Err(Error::CorruptObject(format!(
"pack stream truncated: need {n} bytes at offset {}",
self.pos
)));
}
let slice = &self.data[self.pos..self.pos + n];
self.pos += n;
Ok(slice)
}
fn read_byte(&mut self) -> Result<u8> {
if self.pos >= self.data.len() {
return Err(Error::CorruptObject(
"unexpected end of pack stream".to_owned(),
));
}
let b = self.data[self.pos];
self.pos += 1;
Ok(b)
}
fn read_u32_be(&mut self) -> Result<u32> {
let bytes = self.read_exact(4)?;
Ok(u32::from_be_bytes(bytes.try_into().map_err(|_| {
Error::CorruptObject("u32 read failed".to_owned())
})?))
}
fn read_type_size(&mut self) -> Result<(u8, usize)> {
let c = self.read_byte()?;
let type_code = (c >> 4) & 0x7;
let mut size = (c & 0x0f) as usize;
let mut shift = 4u32;
let mut cur = c;
while cur & 0x80 != 0 {
cur = self.read_byte()?;
size |= ((cur & 0x7f) as usize) << shift;
shift += 7;
}
Ok((type_code, size))
}
fn read_ofs_neg_offset(&mut self) -> Result<usize> {
let mut c = self.read_byte()?;
let mut value = (c & 0x7f) as usize;
while c & 0x80 != 0 {
c = self.read_byte()?;
value = (value + 1) << 7 | (c & 0x7f) as usize;
}
Ok(value)
}
fn decompress(&mut self, expected_size: usize) -> Result<Vec<u8>> {
let slice = &self.data[self.pos..];
let mut decoder = ZlibDecoder::new(slice);
let mut out = Vec::with_capacity(expected_size);
decoder
.read_to_end(&mut out)
.map_err(|e| Error::Zlib(e.to_string()))?;
if out.len() != expected_size {
return Err(Error::CorruptObject(format!(
"decompressed {} bytes but expected {}",
out.len(),
expected_size
)));
}
self.pos += decoder.total_in() as usize;
Ok(out)
}
}
pub fn apply_delta(base: &[u8], delta: &[u8]) -> Result<Vec<u8>> {
let mut pos = 0usize;
let src_size = read_delta_varint(delta, &mut pos)?;
if src_size != base.len() {
return Err(Error::CorruptObject(format!(
"delta source size {src_size} != base size {}",
base.len()
)));
}
let dest_size = read_delta_varint(delta, &mut pos)?;
let mut result = Vec::with_capacity(dest_size);
while pos < delta.len() {
let cmd = delta[pos];
pos += 1;
if cmd == 0 {
return Err(Error::CorruptObject(
"reserved opcode 0 in delta stream".to_owned(),
));
}
if cmd & 0x80 != 0 {
let mut offset = 0usize;
let mut size = 0usize;
macro_rules! maybe_read_byte {
($flag:expr, $shift:expr, $target:expr) => {
if cmd & $flag != 0 {
let b = *delta.get(pos).ok_or_else(|| {
Error::CorruptObject("truncated delta COPY operand".to_owned())
})?;
pos += 1;
$target |= (b as usize) << $shift;
}
};
}
maybe_read_byte!(0x01, 0, offset);
maybe_read_byte!(0x02, 8, offset);
maybe_read_byte!(0x04, 16, offset);
maybe_read_byte!(0x08, 24, offset);
maybe_read_byte!(0x10, 0, size);
maybe_read_byte!(0x20, 8, size);
maybe_read_byte!(0x40, 16, size);
if size == 0 {
size = 0x10000;
}
let end = offset.checked_add(size).ok_or_else(|| {
Error::CorruptObject("delta COPY range overflows usize".to_owned())
})?;
let chunk = base.get(offset..end).ok_or_else(|| {
Error::CorruptObject(format!(
"delta COPY [{offset},{end}) out of range (base is {} bytes)",
base.len()
))
})?;
result.extend_from_slice(chunk);
} else {
let n = cmd as usize;
let chunk = delta
.get(pos..pos + n)
.ok_or_else(|| Error::CorruptObject("truncated delta INSERT data".to_owned()))?;
result.extend_from_slice(chunk);
pos += n;
}
}
if result.len() != dest_size {
return Err(Error::CorruptObject(format!(
"delta produced {} bytes but expected {dest_size}",
result.len()
)));
}
Ok(result)
}
fn read_delta_varint(data: &[u8], pos: &mut usize) -> Result<usize> {
let mut value = 0usize;
let mut shift = 0u32;
loop {
let b = *data
.get(*pos)
.ok_or_else(|| Error::CorruptObject("truncated delta varint".to_owned()))?;
*pos += 1;
value |= ((b & 0x7f) as usize) << shift;
shift += 7;
if b & 0x80 == 0 {
break;
}
}
Ok(value)
}
// Unit tests for delta application and pack unpacking.
#[cfg(test)]
mod tests {
use super::*;
// Builds a version-2 pack containing the given non-delta objects, each
// zlib-compressed, followed by the SHA-1 trailer over everything before it.
fn make_pack(objects: &[(ObjectKind, &[u8])]) -> Vec<u8> {
use flate2::write::ZlibEncoder;
use std::io::Write;
let mut entries: Vec<Vec<u8>> = Vec::new();
for (kind, data) in objects {
let type_code: u8 = match kind {
ObjectKind::Commit => 1,
ObjectKind::Tree => 2,
ObjectKind::Blob => 3,
ObjectKind::Tag => 4,
};
// Entry header: type in bits 4-6 and the low 4 size bits in the first byte,
// then 7 size bits per continuation byte (bit 7 set while more follow).
let mut header = Vec::new();
let mut size = data.len();
let first = ((type_code & 0x7) << 4) as u8 | (size & 0x0f) as u8;
size >>= 4;
if size > 0 {
header.push(first | 0x80);
while size > 0 {
let b = (size & 0x7f) as u8;
size >>= 7;
header.push(if size > 0 { b | 0x80 } else { b });
}
} else {
header.push(first);
}
let mut enc = ZlibEncoder::new(Vec::new(), flate2::Compression::default());
enc.write_all(data).unwrap();
let compressed = enc.finish().unwrap();
let mut entry = header;
entry.extend_from_slice(&compressed);
entries.push(entry);
}
// Pack header: signature, version 2, object count (both big-endian u32).
let mut pack = Vec::new();
pack.extend_from_slice(b"PACK");
pack.extend_from_slice(&2u32.to_be_bytes());
pack.extend_from_slice(&(objects.len() as u32).to_be_bytes());
for entry in &entries {
pack.extend_from_slice(entry);
}
let mut hasher = Sha1::new();
hasher.update(&pack);
let digest = hasher.finalize();
pack.extend_from_slice(digest.as_slice());
pack
}
// COPY of the whole base followed by an INSERT of literal bytes.
#[test]
fn test_apply_delta_simple() {
let base = b"hello";
let mut delta = Vec::new();
// Source size 5, result size 11.
delta.push(5u8);
delta.push(11u8);
// COPY with one offset byte (0) and one size byte (5), then the INSERT opcode for 6 bytes.
delta.push(0x80 | 0x01 | 0x10); delta.push(0u8); delta.push(5u8); delta.push(6u8);
delta.extend_from_slice(b" world");
let result = apply_delta(base, &delta).unwrap();
assert_eq!(result, b"hello world");
}
// A delta against an empty base that consists of a single INSERT.
#[test]
fn test_apply_delta_insert_only() {
let base = b"";
let mut delta = Vec::new();
// Source size 0, result size 5, INSERT of 5 literal bytes.
delta.push(0u8); delta.push(5u8); delta.push(5u8); delta.extend_from_slice(b"hello");
let result = apply_delta(base, &delta).unwrap();
assert_eq!(result, b"hello");
}
// A single COPY from the middle of the base.
#[test]
fn test_apply_delta_copy_only() {
let base = b"abcdef";
let mut delta = Vec::new();
// Source size 6, result size 3, COPY opcode with one offset byte and one size byte.
delta.push(6u8); delta.push(3u8); delta.push(0x91u8);
// Offset 2, size 3.
delta.push(2u8); delta.push(3u8);
let result = apply_delta(base, &delta).unwrap();
assert_eq!(result, b"cde");
}
// A COPY whose size operand is absent (i.e. 0) must copy 0x10000 bytes.
#[test]
fn test_apply_delta_size_zero_means_65536() {
let base = vec![0xABu8; 65536];
let mut delta = Vec::new();
// Source size 65536 as a three-byte varint, then the first byte of the result-size varint.
delta.push(0x80 | (65536 & 0x7f) as u8); delta.push(0x80 | ((65536 >> 7) & 0x7f) as u8); delta.push(((65536 >> 14) & 0x7f) as u8); delta.push(0x80 | (65536 & 0x7f) as u8);
// Remaining two bytes of the result-size varint (also 65536).
delta.push(0x80 | ((65536 >> 7) & 0x7f) as u8);
delta.push(((65536 >> 14) & 0x7f) as u8);
// Bare COPY opcode: no offset bytes (offset 0) and no size bytes (size 0).
delta.push(0x80u8);
let result = apply_delta(&base, &delta).unwrap();
assert_eq!(result.len(), 65536);
assert!(result.iter().all(|&b| b == 0xAB));
}
// Two blobs are unpacked into a fresh object database and can be read back.
#[test]
fn test_unpack_objects_blobs() {
use tempfile::TempDir;
let tmp = TempDir::new().unwrap();
let objects_dir = tmp.path().join("objects");
std::fs::create_dir_all(&objects_dir).unwrap();
let odb = Odb::new(&objects_dir);
let pack = make_pack(&[
(ObjectKind::Blob, b"hello\n"),
(ObjectKind::Blob, b"world\n"),
]);
let opts = UnpackOptions::default();
let count = unpack_objects(&mut pack.as_slice(), &odb, &opts).unwrap();
assert_eq!(count, 2);
let oid1 = Odb::hash_object_data(ObjectKind::Blob, b"hello\n");
let oid2 = Odb::hash_object_data(ObjectKind::Blob, b"world\n");
let obj1 = odb.read(&oid1).unwrap();
let obj2 = odb.read(&oid2).unwrap();
assert_eq!(obj1.data, b"hello\n");
assert_eq!(obj2.data, b"world\n");
}
// In dry-run mode the object is counted but must not appear in the database.
#[test]
fn test_unpack_objects_dry_run_writes_nothing() {
use tempfile::TempDir;
let tmp = TempDir::new().unwrap();
let objects_dir = tmp.path().join("objects");
std::fs::create_dir_all(&objects_dir).unwrap();
let odb = Odb::new(&objects_dir);
let pack = make_pack(&[(ObjectKind::Blob, b"test content")]);
let opts = UnpackOptions {
dry_run: true,
quiet: true,
};
let count = unpack_objects(&mut pack.as_slice(), &odb, &opts).unwrap();
assert_eq!(count, 1);
let oid = Odb::hash_object_data(ObjectKind::Blob, b"test content");
assert!(!odb.exists(&oid));
}
// A stream that does not start with "PACK" is rejected.
#[test]
fn test_unpack_objects_bad_signature() {
use tempfile::TempDir;
let tmp = TempDir::new().unwrap();
let objects_dir = tmp.path().join("objects");
std::fs::create_dir_all(&objects_dir).unwrap();
let odb = Odb::new(&objects_dir);
// Wrong signature, version 2, zero objects, then 20 zero bytes in place of the trailer.
let mut bad = b"NOPE\x00\x00\x00\x02\x00\x00\x00\x00".to_vec();
bad.extend_from_slice(&[0u8; 20]);
let opts = UnpackOptions::default();
let err = unpack_objects(&mut bad.as_slice(), &odb, &opts).unwrap_err();
assert!(err.to_string().contains("invalid signature"));
}
// Corrupting the trailer must be reported as a checksum error.
#[test]
fn test_unpack_objects_checksum_mismatch() {
use tempfile::TempDir;
let tmp = TempDir::new().unwrap();
let objects_dir = tmp.path().join("objects");
std::fs::create_dir_all(&objects_dir).unwrap();
let odb = Odb::new(&objects_dir);
let mut pack = make_pack(&[(ObjectKind::Blob, b"data")]);
let n = pack.len();
// Flip every bit of the last trailer byte.
pack[n - 1] ^= 0xFF;
let opts = UnpackOptions::default();
let err = unpack_objects(&mut pack.as_slice(), &odb, &opts).unwrap_err();
assert!(err.to_string().contains("checksum"));
}
// The declared source size (3) disagrees with the two-byte base.
#[test]
fn test_apply_delta_source_size_mismatch() {
let base = b"hi";
let delta = [3u8, 2u8, 2u8, b'h', b'i']; let err = apply_delta(base, &delta).unwrap_err();
assert!(err.to_string().contains("source size"));
}
}