git_plumber/git/pack/
index.rs

1use nom::{
2    IResult, Parser,
3    bytes::complete::{tag, take},
4    error::{Error, ErrorKind},
5    multi::count,
6    number::complete::{be_u32, be_u64},
7};
8use std::fmt;
9
10use crate::git::pack::PackError;
11
12/// Represents a Git pack index file (.idx)
13///
14/// Pack index files provide efficient lookup into pack files by mapping
15/// object SHA-1 hashes to their byte offsets within the corresponding pack file.
16#[derive(Debug, Clone)]
17pub struct PackIndex {
18    /// Index file format version (should be 2)
19    pub version: u32,
20    /// Fan-out table: 256 entries indicating object count for each first byte
21    pub fan_out: [u32; 256],
22    /// Sorted array of 20-byte SHA-1 object names
23    pub object_names: Vec<[u8; 20]>,
24    /// CRC32 checksums for packed object data (for integrity verification)
25    pub crc32_checksums: Vec<u32>,
26    /// 4-byte offsets into the pack file for each object
27    pub offsets: Vec<u32>,
28    /// Optional 8-byte offsets for large pack files (when 4-byte offset has MSB set)
29    pub large_offsets: Option<Vec<u64>>,
30    /// SHA-1 checksum of the corresponding pack file
31    pub pack_checksum: [u8; 20],
32    /// SHA-1 checksum of all the index data above
33    pub index_checksum: [u8; 20],
34    /// Raw data for debugging/display purposes
35    pub raw_data: Vec<u8>,
36}
37
38impl PackIndex {
39    /// Parse a pack index file from raw bytes
40    pub fn parse(input: &[u8]) -> IResult<&[u8], Self> {
41        let original_input = input;
42
43        // Parse header (magic + version)
44        let (input, _) = Self::parse_header(input)?;
45
46        // Parse fan-out table (256 entries)
47        let (input, fan_out) = Self::parse_fan_out_table(input)?;
48
49        // Get total object count from last fan-out entry
50        let total_objects = fan_out[255] as usize;
51
52        // Parse object names (20 bytes each)
53        let (input, object_names) = count(Self::parse_object_name, total_objects).parse(input)?;
54
55        // Parse CRC32 table (4 bytes per object)
56        let (input, crc32_checksums) = count(be_u32, total_objects).parse(input)?;
57
58        // Parse offset table (4 bytes per object)
59        let (input, offsets) = count(be_u32, total_objects).parse(input)?;
60
61        // Check if we need large offsets (any offset has MSB set)
62        let needs_large_offsets = offsets.iter().any(|&offset| offset & 0x80000000 != 0);
63        let large_offset_count = offsets
64            .iter()
65            .filter(|&&offset| offset & 0x80000000 != 0)
66            .count();
67
68        // Parse large offset table if needed
69        let (input, large_offsets) = if needs_large_offsets {
70            let (input, large_offsets) = count(be_u64, large_offset_count).parse(input)?;
71            (input, Some(large_offsets))
72        } else {
73            (input, None)
74        };
75
76        // Parse pack checksum (20 bytes)
77        let (input, pack_checksum_bytes) = take(20usize)(input)?;
78        let mut pack_checksum = [0u8; 20];
79        pack_checksum.copy_from_slice(pack_checksum_bytes);
80
81        // Parse index checksum (20 bytes)
82        let (input, index_checksum_bytes) = take(20usize)(input)?;
83        let mut index_checksum = [0u8; 20];
84        index_checksum.copy_from_slice(index_checksum_bytes);
85
86        // Calculate raw data size (everything we've consumed)
87        let consumed = original_input.len() - input.len();
88        let raw_data = original_input[..consumed].to_vec();
89
90        Ok((
91            input,
92            PackIndex {
93                version: 2, // We only support version 2
94                fan_out,
95                object_names,
96                crc32_checksums,
97                offsets,
98                large_offsets,
99                pack_checksum,
100                index_checksum,
101                raw_data,
102            },
103        ))
104    }
105
106    /// Parse the index header (magic number + version)
107    fn parse_header(input: &[u8]) -> IResult<&[u8], ()> {
108        // Magic number for version 2: \377tOc (0xff744f63)
109        let magic_bytes: &[u8] = &[0xff, 0x74, 0x4f, 0x63];
110        let (input, _) = tag(magic_bytes)(input)?;
111
112        // Version number (should be 2)
113        let (input, version) = be_u32(input)?;
114
115        if version != 2 {
116            return Err(nom::Err::Error(Error::new(input, ErrorKind::Tag)));
117        }
118
119        Ok((input, ()))
120    }
121
122    /// Parse the fan-out table (256 entries)
123    fn parse_fan_out_table(input: &[u8]) -> IResult<&[u8], [u32; 256]> {
124        let (input, fan_out_vec) = count(be_u32, 256).parse(input)?;
125
126        // Convert Vec to array
127        let mut fan_out = [0u32; 256];
128        for (i, &value) in fan_out_vec.iter().enumerate() {
129            fan_out[i] = value;
130        }
131
132        // Validate fan-out table is monotonic
133        for i in 1..256 {
134            if fan_out[i] < fan_out[i - 1] {
135                return Err(nom::Err::Error(Error::new(input, ErrorKind::Verify)));
136            }
137        }
138
139        Ok((input, fan_out))
140    }
141
142    /// Parse a single 20-byte object name (SHA-1)
143    fn parse_object_name(input: &[u8]) -> IResult<&[u8], [u8; 20]> {
144        let (input, name_bytes) = take(20usize)(input)?;
145        let mut name = [0u8; 20];
146        name.copy_from_slice(name_bytes);
147        Ok((input, name))
148    }
149
150    /// Get the total number of objects in this index
151    pub fn object_count(&self) -> usize {
152        self.object_names.len()
153    }
154
155    /// Look up an object by its SHA-1 hash
156    /// Returns the offset in the pack file if found
157    pub fn lookup_object(&self, sha1: &[u8; 20]) -> Option<u64> {
158        // Use fan-out table for efficient binary search
159        let first_byte = sha1[0] as usize;
160
161        // Determine search range using fan-out table
162        let start_idx = if first_byte == 0 {
163            0
164        } else {
165            self.fan_out[first_byte - 1] as usize
166        };
167        let end_idx = self.fan_out[first_byte] as usize;
168
169        // Binary search within the range
170        let search_slice = &self.object_names[start_idx..end_idx];
171
172        match search_slice.binary_search(sha1) {
173            Ok(relative_idx) => {
174                let absolute_idx = start_idx + relative_idx;
175                Some(self.get_object_offset(absolute_idx))
176            }
177            Err(_) => None,
178        }
179    }
180
181    /// Get the pack file offset for an object at the given index
182    pub fn get_object_offset(&self, index: usize) -> u64 {
183        if index >= self.offsets.len() {
184            return 0;
185        }
186
187        let offset = self.offsets[index];
188
189        // Check if this is a large offset (MSB set)
190        if offset & 0x80000000 != 0 {
191            // Use large offset table
192            let large_offset_index = (offset & 0x7fffffff) as usize;
193            if let Some(ref large_offsets) = self.large_offsets
194                && large_offset_index < large_offsets.len()
195            {
196                return large_offsets[large_offset_index];
197            }
198        }
199
200        offset as u64
201    }
202
203    /// Get the CRC32 checksum for an object at the given index
204    pub fn get_object_crc32(&self, index: usize) -> Option<u32> {
205        self.crc32_checksums.get(index).copied()
206    }
207
208    /// Verify the integrity of the index file
209    pub fn verify_checksum(&self) -> Result<(), PackError> {
210        // TODO: Implement SHA-1 checksum verification
211        // This would involve calculating SHA-1 of all data except the final 20 bytes
212        // and comparing with self.index_checksum
213        Ok(())
214    }
215}
216
217impl fmt::Display for PackIndex {
218    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
219        writeln!(f, "Pack Index (version {})", self.version)?;
220        writeln!(f, "Total objects: {}", self.object_count())?;
221        writeln!(f, "Pack checksum: {}", hex::encode(self.pack_checksum))?;
222        writeln!(f, "Index checksum: {}", hex::encode(self.index_checksum))?;
223
224        if let Some(ref large_offsets) = self.large_offsets {
225            writeln!(f, "Large offsets: {} entries", large_offsets.len())?;
226        }
227
228        // Show distribution from fan-out table
229        writeln!(f, "\nObject distribution by first byte:")?;
230        let mut prev_count = 0;
231        for (byte, &count) in self.fan_out.iter().enumerate() {
232            let objects_for_byte = count - prev_count;
233            if objects_for_byte > 0 {
234                writeln!(f, "  0x{:02x}: {} objects", byte, objects_for_byte)?;
235            }
236            prev_count = count;
237        }
238
239        Ok(())
240    }
241}
242
243#[cfg(test)]
244mod tests {
245    use super::*;
246
247    #[test]
248    fn test_parse_header() {
249        let header_data = [
250            0xff, 0x74, 0x4f, 0x63, // Magic number
251            0x00, 0x00, 0x00, 0x02, // Version 2
252        ];
253
254        let (remaining, _) = PackIndex::parse_header(&header_data).unwrap();
255        assert!(remaining.is_empty());
256    }
257
258    #[test]
259    fn test_parse_object_name() {
260        let name_data = [0x01; 20]; // 20 bytes of 0x01
261
262        let (remaining, name) = PackIndex::parse_object_name(&name_data).unwrap();
263        assert!(remaining.is_empty());
264        assert_eq!(name, [0x01; 20]);
265    }
266
267    #[test]
268    fn test_fan_out_validation() {
269        // Valid monotonic fan-out table
270        let mut fan_out_data = Vec::new();
271        for i in 0..256 {
272            fan_out_data.extend_from_slice(&(i as u32).to_be_bytes());
273        }
274
275        let (_, fan_out) = PackIndex::parse_fan_out_table(&fan_out_data).unwrap();
276        assert_eq!(fan_out[0], 0);
277        assert_eq!(fan_out[255], 255);
278    }
279
280    #[test]
281    fn test_lookup_object() {
282        // Create a minimal index for testing
283        let mut fan_out = [0u32; 256];
284        fan_out[0] = 1; // One object starting with 0x00
285        for i in 1..256 {
286            fan_out[i] = 1; // All subsequent entries have the same count
287        }
288
289        let object_names = vec![[0x00; 20]]; // One object with all zeros
290        let crc32_checksums = vec![0x12345678];
291        let offsets = vec![100]; // Offset 100 in pack file
292
293        let index = PackIndex {
294            version: 2,
295            fan_out,
296            object_names,
297            crc32_checksums,
298            offsets,
299            large_offsets: None,
300            pack_checksum: [0; 20],
301            index_checksum: [0; 20],
302            raw_data: vec![],
303        };
304
305        // Should find the object
306        assert_eq!(index.lookup_object(&[0x00; 20]), Some(100));
307
308        // Should not find a different object
309        assert_eq!(index.lookup_object(&[0xff; 20]), None);
310    }
311
312    #[test]
313    fn test_multiple_objects_lookup() {
314        // Create an index with multiple objects for more realistic testing
315        let mut fan_out = [0u32; 256];
316
317        // Set up fan-out table for objects starting with 0x00, 0x01, 0xaa, 0xff
318        fan_out[0] = 2; // 2 objects starting with 0x00 or less
319        fan_out[1] = 3; // 3 objects starting with 0x01 or less  
320        for i in 2..0xaa {
321            fan_out[i] = 3; // Same count up to 0xaa
322        }
323        fan_out[0xaa] = 4; // 4 objects starting with 0xaa or less
324        for i in 0xab..0xff {
325            fan_out[i] = 4; // Same count up to 0xff
326        }
327        fan_out[0xff] = 5; // 5 objects total
328
329        // Create 5 test objects with specific first bytes
330        let mut obj1 = [0u8; 20];
331        obj1[0] = 0x00;
332        let mut obj2 = [0u8; 20];
333        obj2[0] = 0x00;
334        obj2[1] = 0x01; // Different from obj1
335        let mut obj3 = [0u8; 20];
336        obj3[0] = 0x01;
337        let mut obj4 = [0u8; 20];
338        obj4[0] = 0xaa;
339        let mut obj5 = [0u8; 20];
340        obj5[0] = 0xff;
341
342        let object_names = vec![obj1, obj2, obj3, obj4, obj5];
343        let crc32_checksums = vec![0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555];
344        let offsets = vec![100, 200, 300, 400, 500];
345
346        let index = PackIndex {
347            version: 2,
348            fan_out,
349            object_names,
350            crc32_checksums,
351            offsets,
352            large_offsets: None,
353            pack_checksum: [0; 20],
354            index_checksum: [0; 20],
355            raw_data: vec![],
356        };
357
358        // Test lookups
359        assert_eq!(index.lookup_object(&obj1), Some(100));
360        assert_eq!(index.lookup_object(&obj2), Some(200));
361        assert_eq!(index.lookup_object(&obj3), Some(300));
362        assert_eq!(index.lookup_object(&obj4), Some(400));
363        assert_eq!(index.lookup_object(&obj5), Some(500));
364
365        // Test non-existent object
366        let mut nonexistent = [0u8; 20];
367        nonexistent[0] = 0x80; // First byte that doesn't match any object
368        assert_eq!(index.lookup_object(&nonexistent), None);
369
370        // Test CRC32 access
371        assert_eq!(index.get_object_crc32(0), Some(0x11111111));
372        assert_eq!(index.get_object_crc32(4), Some(0x55555555));
373        assert_eq!(index.get_object_crc32(10), None); // Out of bounds
374    }
375
376    #[test]
377    fn test_large_offsets() {
378        // Test large offset handling
379        let mut fan_out = [0u32; 256];
380        fan_out[0] = 1;
381        for i in 1..256 {
382            fan_out[i] = 1;
383        }
384
385        let object_names = vec![[0x00; 20]];
386        let crc32_checksums = vec![0x12345678];
387        let offsets = vec![0x80000000]; // MSB set indicates large offset
388        let large_offsets = Some(vec![0x123456789abcdef0]); // Large 8-byte offset
389
390        let index = PackIndex {
391            version: 2,
392            fan_out,
393            object_names,
394            crc32_checksums,
395            offsets,
396            large_offsets,
397            pack_checksum: [0; 20],
398            index_checksum: [0; 20],
399            raw_data: vec![],
400        };
401
402        // Should return the large offset
403        assert_eq!(index.get_object_offset(0), 0x123456789abcdef0);
404        assert_eq!(index.lookup_object(&[0x00; 20]), Some(0x123456789abcdef0));
405    }
406}