// hashtree_git/pack.rs

1//! Git packfile generation and parsing
2//!
3//! Packfiles are git's binary format for efficiently transferring objects.
4//! Format: PACK header, N objects, SHA-1 checksum
5
6use sha1::{Sha1, Digest};
7use flate2::write::ZlibEncoder;
8use flate2::read::ZlibDecoder;
9use flate2::Compression;
10use std::io::{Read, Write};
11
12use crate::object::{ObjectId, ObjectType, GitObject};
13use crate::storage::GitStorage;
14use crate::{Error, Result};
15
/// Pack object type encoding
///
/// Discriminants match the 3-bit type field of a pack object header
/// (git pack format, version 2), so `as u8` casts go straight onto the
/// wire — do not reorder or renumber the variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum PackObjectType {
    /// Commit object, wire code 1.
    Commit = 1,
    /// Tree object, wire code 2.
    Tree = 2,
    /// Blob object, wire code 3.
    Blob = 3,
    /// Annotated tag object, wire code 4.
    Tag = 4,
    // Delta types (6, 7) not implemented for simplicity
}
26
27impl PackObjectType {
28    pub fn from_object_type(t: ObjectType) -> Self {
29        match t {
30            ObjectType::Commit => PackObjectType::Commit,
31            ObjectType::Tree => PackObjectType::Tree,
32            ObjectType::Blob => PackObjectType::Blob,
33            ObjectType::Tag => PackObjectType::Tag,
34        }
35    }
36
37    pub fn to_object_type(self) -> ObjectType {
38        match self {
39            PackObjectType::Commit => ObjectType::Commit,
40            PackObjectType::Tree => ObjectType::Tree,
41            PackObjectType::Blob => ObjectType::Blob,
42            PackObjectType::Tag => ObjectType::Tag,
43        }
44    }
45
46    pub fn from_u8(v: u8) -> Option<Self> {
47        match v {
48            1 => Some(PackObjectType::Commit),
49            2 => Some(PackObjectType::Tree),
50            3 => Some(PackObjectType::Blob),
51            4 => Some(PackObjectType::Tag),
52            _ => None,
53        }
54    }
55}
56
57/// Generate a packfile containing the given objects
58pub fn generate_packfile(storage: &GitStorage, oids: &[ObjectId]) -> Result<Vec<u8>> {
59    let mut pack = Vec::new();
60
61    // Header: "PACK" + version (2) + object count
62    pack.extend_from_slice(b"PACK");
63    pack.extend_from_slice(&2u32.to_be_bytes()); // version 2
64    pack.extend_from_slice(&(oids.len() as u32).to_be_bytes());
65
66    // Objects
67    for oid in oids {
68        let obj = storage.read_object(oid)?;
69        write_pack_object(&mut pack, &obj)?;
70    }
71
72    // Checksum: SHA-1 of everything before
73    let mut hasher = Sha1::new();
74    hasher.update(&pack);
75    let checksum = hasher.finalize();
76    pack.extend_from_slice(&checksum);
77
78    Ok(pack)
79}
80
81/// Write a single object to the packfile
82fn write_pack_object(pack: &mut Vec<u8>, obj: &GitObject) -> Result<()> {
83    let pack_type = PackObjectType::from_object_type(obj.obj_type);
84    let size = obj.content.len();
85
86    // Encode type and size in variable-length format
87    // First byte: 1-bit MSB continue flag, 3-bit type, 4-bit size LSB
88    let mut c = ((pack_type as u8) << 4) | ((size & 0x0F) as u8);
89    let mut remaining = size >> 4;
90
91    if remaining > 0 {
92        c |= 0x80; // More bytes follow
93    }
94    pack.push(c);
95
96    // Remaining size bytes: 7 bits each with MSB continue flag
97    while remaining > 0 {
98        let mut byte = (remaining & 0x7F) as u8;
99        remaining >>= 7;
100        if remaining > 0 {
101            byte |= 0x80;
102        }
103        pack.push(byte);
104    }
105
106    // Compress object content
107    let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
108    encoder.write_all(&obj.content)?;
109    let compressed = encoder.finish()?;
110    pack.extend_from_slice(&compressed);
111
112    Ok(())
113}
114
115/// Parse a packfile, storing objects and returning their IDs
116pub fn parse_packfile(storage: &GitStorage, data: &[u8]) -> Result<Vec<ObjectId>> {
117    if data.len() < 20 {
118        return Err(Error::PackError("packfile too small".into()));
119    }
120
121    // Verify header
122    if &data[0..4] != b"PACK" {
123        return Err(Error::PackError("invalid packfile magic".into()));
124    }
125
126    let version = u32::from_be_bytes([data[4], data[5], data[6], data[7]]);
127    if version != 2 {
128        return Err(Error::PackError(format!("unsupported pack version: {}", version)));
129    }
130
131    let object_count = u32::from_be_bytes([data[8], data[9], data[10], data[11]]);
132
133    // Verify checksum
134    let checksum_start = data.len() - 20;
135    let mut hasher = Sha1::new();
136    hasher.update(&data[..checksum_start]);
137    let computed = hasher.finalize();
138    if &computed[..] != &data[checksum_start..] {
139        return Err(Error::PackError("checksum mismatch".into()));
140    }
141
142    // Parse objects
143    let mut pos = 12; // After header
144    let mut oids = Vec::with_capacity(object_count as usize);
145
146    for _ in 0..object_count {
147        let (obj, bytes_consumed) = parse_pack_object(&data[pos..checksum_start])?;
148        pos += bytes_consumed;
149
150        let oid = storage.write_object(&obj)?;
151        oids.push(oid);
152    }
153
154    Ok(oids)
155}
156
157/// Parse a single object from packfile data
158fn parse_pack_object(data: &[u8]) -> Result<(GitObject, usize)> {
159    let mut pos = 0;
160
161    // Read type and size
162    let first_byte = data[pos];
163    pos += 1;
164
165    let type_bits = (first_byte >> 4) & 0x07;
166    let pack_type = PackObjectType::from_u8(type_bits)
167        .ok_or_else(|| Error::PackError(format!("unsupported object type: {}", type_bits)))?;
168
169    let mut size = (first_byte & 0x0F) as usize;
170    let mut shift = 4;
171
172    // Read remaining size bytes
173    if first_byte & 0x80 != 0 {
174        loop {
175            if pos >= data.len() {
176                return Err(Error::PackError("truncated size".into()));
177            }
178            let byte = data[pos];
179            pos += 1;
180            size |= ((byte & 0x7F) as usize) << shift;
181            shift += 7;
182            if byte & 0x80 == 0 {
183                break;
184            }
185        }
186    }
187
188    // Decompress content
189    let mut decoder = ZlibDecoder::new(&data[pos..]);
190    let mut content = vec![0u8; size];
191    decoder.read_exact(&mut content)?;
192
193    // Calculate how many bytes of compressed data we consumed
194    let compressed_size = decoder.total_in() as usize;
195    pos += compressed_size;
196
197    let obj = GitObject::new(pack_type.to_object_type(), content);
198    Ok((obj, pos))
199}
200
/// Thin packfile generation for upload-pack
/// Generates a packfile with only the objects the client needs
///
/// Usage: register root objects with `want`, client-known objects with
/// `have`, then call `build` to walk the object graph and produce the
/// pack bytes.
pub struct PackBuilder<'a> {
    // Object store the pack is assembled from.
    storage: &'a GitStorage,
    /// Objects to include
    want: Vec<ObjectId>,
    /// Objects the client already has
    have: Vec<ObjectId>,
}
210
211impl<'a> PackBuilder<'a> {
212    pub fn new(storage: &'a GitStorage) -> Self {
213        Self {
214            storage,
215            want: Vec::new(),
216            have: Vec::new(),
217        }
218    }
219
220    pub fn want(&mut self, oid: ObjectId) {
221        self.want.push(oid);
222    }
223
224    pub fn have(&mut self, oid: ObjectId) {
225        self.have.push(oid);
226    }
227
228    /// Build the packfile, walking the object graph
229    pub fn build(self) -> Result<Vec<u8>> {
230        let mut needed = std::collections::HashSet::new();
231        let have_set: std::collections::HashSet<_> = self.have.iter().copied().collect();
232
233        // Walk from want commits to find all needed objects
234        for oid in &self.want {
235            Self::walk_object_static(self.storage, *oid, &have_set, &mut needed)?;
236        }
237
238        // Generate packfile
239        let oids: Vec<_> = needed.into_iter().collect();
240        generate_packfile(self.storage, &oids)
241    }
242
243    /// Recursively walk an object and its dependencies
244    fn walk_object_static(
245        storage: &GitStorage,
246        oid: ObjectId,
247        have: &std::collections::HashSet<ObjectId>,
248        needed: &mut std::collections::HashSet<ObjectId>,
249    ) -> Result<()> {
250        if have.contains(&oid) || needed.contains(&oid) {
251            return Ok(());
252        }
253
254        if !storage.has_object(&oid)? {
255            return Ok(()); // Object doesn't exist, skip
256        }
257
258        needed.insert(oid);
259
260        let obj = storage.read_object(&oid)?;
261
262        match obj.obj_type {
263            ObjectType::Commit => {
264                // Parse commit to find tree and parents
265                let content = String::from_utf8_lossy(&obj.content);
266                for line in content.lines() {
267                    if let Some(tree_hex) = line.strip_prefix("tree ") {
268                        if let Some(tree_oid) = ObjectId::from_hex(tree_hex.trim()) {
269                            Self::walk_object_static(storage, tree_oid, have, needed)?;
270                        }
271                    } else if let Some(parent_hex) = line.strip_prefix("parent ") {
272                        if let Some(parent_oid) = ObjectId::from_hex(parent_hex.trim()) {
273                            Self::walk_object_static(storage, parent_oid, have, needed)?;
274                        }
275                    } else if line.is_empty() {
276                        break; // End of headers
277                    }
278                }
279            }
280            ObjectType::Tree => {
281                // Parse tree entries
282                let entries = crate::object::parse_tree(&obj.content)?;
283                for entry in entries {
284                    Self::walk_object_static(storage, entry.oid, have, needed)?;
285                }
286            }
287            ObjectType::Tag => {
288                // Parse tag to find object
289                let content = String::from_utf8_lossy(&obj.content);
290                for line in content.lines() {
291                    if let Some(obj_hex) = line.strip_prefix("object ") {
292                        if let Some(obj_oid) = ObjectId::from_hex(obj_hex.trim()) {
293                            Self::walk_object_static(storage, obj_oid, have, needed)?;
294                        }
295                    }
296                }
297            }
298            ObjectType::Blob => {
299                // Blobs have no dependencies
300            }
301        }
302
303        Ok(())
304    }
305}
306
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    // End-to-end check: objects written to one storage must survive a
    // trip through generate_packfile -> parse_packfile into a second,
    // independent storage.
    #[test]
    fn test_packfile_roundtrip() {
        let dir = tempdir().unwrap();
        let storage = GitStorage::open(dir.path().join("git")).unwrap();

        // Create some objects
        let blob1 = storage.write_blob(b"hello").unwrap();
        let blob2 = storage.write_blob(b"world").unwrap();

        // Generate packfile
        let pack = generate_packfile(&storage, &[blob1, blob2]).unwrap();

        // Verify header
        assert_eq!(&pack[0..4], b"PACK");

        // Parse in new storage
        let dir2 = tempdir().unwrap();
        let storage2 = GitStorage::open(dir2.path().join("git")).unwrap();
        let parsed_oids = parse_packfile(&storage2, &pack).unwrap();

        assert_eq!(parsed_oids.len(), 2);
        assert!(storage2.has_object(&blob1).unwrap());
        assert!(storage2.has_object(&blob2).unwrap());
    }
}
336}