git_internal/internal/pack/
pack_index.rs

1//! Builder for Git pack index (.idx) files that streams fanout tables, CRCs, offsets, and trailer
2//! hashes through an async channel.
3
4use tokio::sync::mpsc;
5
6pub use crate::internal::pack::index_entry::IndexEntry;
7use crate::{errors::GitError, hash::ObjectHash, utils::HashAlgorithm};
8
9/// Builder for Git pack index (.idx) files that streams data through an async channel.
10/// # Arguments
11/// * `object_number` - Total number of objects in the pack file.
12/// * `sender` - Async channel sender to stream idx data.
13/// * `pack_hash` - Hash of the corresponding pack file (used in the idx trailer).
14/// * `inner_hash` - Hash algorithm instance to compute the idx file hash.
15pub struct IdxBuilder {
16    sender: Option<mpsc::Sender<Vec<u8>>>,
17    inner_hash: HashAlgorithm, //  idx trailer
18    object_number: usize,
19    pack_hash: ObjectHash,
20}
21
22impl IdxBuilder {
23    /// Create a new IdxBuilder.
24    pub fn new(object_number: usize, sender: mpsc::Sender<Vec<u8>>, pack_hash: ObjectHash) -> Self {
25        Self {
26            sender: Some(sender),
27            inner_hash: HashAlgorithm::new(),
28            object_number,
29            pack_hash,
30        }
31    }
32
33    /// Drop the sender to close the channel.
34    pub fn drop_sender(&mut self) {
35        self.sender.take(); // Take the sender out, dropping it
36    }
37
38    /// Send data through the channel and update the inner hash.
39    async fn send_data(&mut self, data: Vec<u8>) -> Result<(), GitError> {
40        if let Some(sender) = &self.sender {
41            self.inner_hash.update(&data);
42            sender.send(data).await.map_err(|e| {
43                GitError::IOError(std::io::Error::new(
44                    std::io::ErrorKind::BrokenPipe,
45                    format!("Failed to send idx data: {e}"),
46                ))
47            })?;
48        }
49        Ok(())
50    }
51
52    /// Send data through the channel without updating the inner hash.
53    async fn send_data_without_update_hash(&mut self, data: Vec<u8>) -> Result<(), GitError> {
54        if let Some(sender) = &self.sender {
55            sender.send(data).await.map_err(|e| {
56                GitError::IOError(std::io::Error::new(
57                    std::io::ErrorKind::BrokenPipe,
58                    format!("Failed to send idx data: {e}"),
59                ))
60            })?;
61        }
62        Ok(())
63    }
64
65    /// send u32 value (big-endian)
66    async fn send_u32(&mut self, v: u32) -> Result<(), GitError> {
67        self.send_data(v.to_be_bytes().to_vec()).await
68    }
69
70    /// send u64 value (big-endian)
71    async fn send_u64(&mut self, v: u64) -> Result<(), GitError> {
72        self.send_data(v.to_be_bytes().to_vec()).await
73    }
74
75    /// Write the idx v2 header (Git pack index format, used for both SHA1 and SHA256).
76    /// The 4-byte pack index signature: \377t0c, followed by 4-byte version number: 2.
77    async fn write_header(&mut self) -> Result<(), GitError> {
78        // .idx v2 header (used for both SHA1 and SHA256)
79        // magic: FF 74 4F 63, version: 2
80        let header: [u8; 8] = [0xFF, 0x74, 0x4F, 0x63, 0, 0, 0, 2];
81        self.send_data(header.to_vec()).await
82    }
83
84    /// Write the fanout table for the index.
85    async fn write_fanout(&mut self, entries: &mut [IndexEntry]) -> Result<(), GitError> {
86        entries.sort_by(|a, b| a.hash.cmp(&b.hash));
87        let mut fanout = [0u32; 256];
88        for entry in entries.iter() {
89            fanout[entry.hash.to_data()[0] as usize] += 1;
90        }
91
92        // Calculate cumulative counts
93        for i in 1..fanout.len() {
94            fanout[i] += fanout[i - 1];
95        }
96
97        // Send all 256 cumulative counts
98        for &count in fanout.iter() {
99            self.send_u32(count).await?;
100        }
101
102        Ok(())
103    }
104
105    /// Write the object names (hashes) to the index.
106    async fn write_names(&mut self, entries: &Vec<IndexEntry>) -> Result<(), GitError> {
107        for e in entries {
108            self.send_data(e.hash.to_data().clone()).await?;
109        }
110
111        Ok(())
112    }
113
114    /// Write the CRC32 checksums for each object in the index.
115    async fn write_crc32(&mut self, entries: &Vec<IndexEntry>) -> Result<(), GitError> {
116        for e in entries {
117            self.send_u32(e.crc32).await?;
118        }
119
120        Ok(())
121    }
122
123    /// Write the offsets for each object in the index, handling large offsets.
124    async fn write_offsets(&mut self, entries: &Vec<IndexEntry>) -> Result<(), GitError> {
125        let mut large = vec![];
126        for e in entries {
127            if e.offset <= 0x7FFF_FFFF {
128                // normal 31-bit offset
129                self.send_u32(e.offset as u32).await?;
130            } else {
131                // MSB=1 => large offset reference , a label for large offset
132                let marker = 0x8000_0000 | large.len() as u32;
133                self.send_u32(marker).await?;
134                large.push(e.offset);
135            }
136        }
137        for v in large {
138            self.send_u64(v).await?;
139        }
140        Ok(())
141    }
142
143    /// Write the idx trailer containing the pack hash and idx file hash.
144    async fn write_trailer(&mut self) -> Result<(), GitError> {
145        // pack hash
146        self.send_data_without_update_hash(self.pack_hash.to_data().clone())
147            .await?;
148
149        let idx_hash = self.inner_hash.clone().finalize();
150        // idx file hash
151        self.send_data(idx_hash).await?;
152        Ok(())
153    }
154
155    /// Write the complete idx file by sending header, fanout, names, CRCs, offsets, and trailer.
156    pub async fn write_idx(&mut self, mut entries: Vec<IndexEntry>) -> Result<(), GitError> {
157        // check entries length
158        if entries.len() != self.object_number {
159            return Err(GitError::ConversionError(format!(
160                "entries length {} != object_number {}",
161                entries.len(),
162                self.object_number
163            )));
164        }
165
166        // write header
167        self.write_header().await?;
168        self.write_fanout(&mut entries).await?;
169        self.write_names(&entries).await?;
170        self.write_crc32(&entries).await?;
171        self.write_offsets(&entries).await?;
172        self.write_trailer().await?;
173        self.drop_sender();
174        Ok(())
175    }
176}
177
#[cfg(test)]
mod tests {
    use tokio::sync::mpsc;

    use crate::{
        errors::GitError,
        hash::ObjectHash,
        internal::pack::{index_entry::IndexEntry, pack_index::IdxBuilder},
    };

    /// Build a fake SHA1 hash whose 20 bytes all equal `n`.
    fn fake_sha1(n: u8) -> ObjectHash {
        ObjectHash::Sha1([n; 20])
    }

    /// Build `n` entries whose hashes are filled with 0, 1, 2, ... (one per fanout bucket).
    fn build_entries_sha1(n: usize) -> Vec<IndexEntry> {
        let mut entries = Vec::with_capacity(n);
        for i in 0..n {
            entries.push(IndexEntry {
                hash: fake_sha1(i as u8),
                crc32: 0x12345678 + i as u32,
                offset: 0x10 + (i as u64) * 3,
            });
        }
        entries
    }

    /// Decode a big-endian `u32` from the first four bytes of `word`.
    fn be_u32(word: &[u8]) -> u32 {
        u32::from_be_bytes([word[0], word[1], word[2], word[3]])
    }

    /// Build a three-object SHA1 index and check every section of the streamed bytes.
    #[tokio::test]
    async fn test_idx_builder_sha1_basic() -> Result<(), GitError> {
        let object_number = 3;

        // The receiver side of the channel plays the role of the idx file.
        let (tx, mut rx) = mpsc::channel::<Vec<u8>>(4096);
        let mut builder = IdxBuilder::new(object_number, tx, fake_sha1(0xAA));

        builder.write_idx(build_entries_sha1(object_number)).await?;

        // Concatenate every streamed chunk into one buffer.
        let mut out: Vec<u8> = Vec::new();
        while let Some(chunk) = rx.recv().await {
            out.extend_from_slice(&chunk);
        }

        // ------- header: FF 74 4F 63 + version 2 -------
        assert_eq!(&out[0..8], &[0xFF, 0x74, 0x4F, 0x63, 0, 0, 0, 2]);

        // ------- fanout: 256 big-endian u32 values right after the header -------
        let fanout_start = 8;
        let fanout_end = fanout_start + 256 * 4;
        let fanout_bytes = &out[fanout_start..fanout_end];

        // First hash bytes are 0, 1, 2, so the cumulative counts are 1, 2, 3, 3, 3, ...
        for (i, word) in fanout_bytes.chunks_exact(4).enumerate() {
            let expected = std::cmp::min(i + 1, object_number) as u32;
            assert_eq!(be_u32(word), expected, "fanout mismatch at index {i}");
        }

        // ------- names: one 20-byte SHA1 per object -------
        let names_start = fanout_end;
        let names_end = names_start + object_number * 20;
        let names_bytes = &out[names_start..names_end];

        for (i, name) in names_bytes.chunks_exact(20).enumerate() {
            assert!(name.iter().all(|b| *b == i as u8));
        }

        // ------- crc32: one big-endian u32 per object -------
        let crc_start = names_end;
        let crc_end = crc_start + object_number * 4;
        let crc_bytes = &out[crc_start..crc_end];

        for (i, word) in crc_bytes.chunks_exact(4).enumerate() {
            let expected = 0x12345678 + i as u32;
            assert_eq!(expected, be_u32(word));
        }

        // ------- offsets: one big-endian u32 per object (all small here) -------
        let offset_start = crc_end;
        let offset_end = offset_start + object_number * 4;
        let offsets_bytes = &out[offset_start..offset_end];

        for (i, word) in offsets_bytes.chunks_exact(4).enumerate() {
            let expected = 0x10 + (i as u64) * 3;
            assert_eq!(expected as u32, be_u32(word));
        }

        // ------- trailer: pack hash -------
        let trailer_pack_hash_start = offset_end;
        let trailer_pack_hash_end = trailer_pack_hash_start + 20;
        let pack_hash_bytes = &out[trailer_pack_hash_start..trailer_pack_hash_end];
        assert!(pack_hash_bytes.iter().all(|b| *b == 0xAA));

        // ------- trailer: idx hash (only its presence is checked, not its value) -------
        let idx_hash = &out[trailer_pack_hash_end..trailer_pack_hash_end + 20];
        assert_eq!(idx_hash.len(), 20);

        Ok(())
    }
}
310}