Skip to main content

git_internal/internal/pack/
utils.rs

1//! Shared pack-parsing helpers for reading object headers, varints, offsets, and zlib-compressed
2//! payloads as defined by Git's pack format.
3
4use std::{
5    fs,
6    io::{self, Read},
7    path::Path,
8};
9
10use sha1::{Digest, Sha1};
11
12use crate::{
13    hash::{ObjectHash, get_hash_kind},
14    internal::object::types::ObjectType,
15};
16
17/// Checks if the reader has reached EOF (end of file).
18///
19/// It attempts to read a single byte from the reader into a buffer.
20/// If `Ok(0)` is returned, it means no byte was read, indicating
21/// that the end of the stream has been reached and there is no more
22/// data left to read.
23///
24/// Any other return value means that data was successfully read, so
25/// the reader has not reached the end yet.
26///
27/// # Arguments
28///
29/// * `reader` - The reader to check for EOF state
30///   It must implement the `std::io::Read` trait
31///
32/// # Returns
33///
34/// true if the reader reached EOF, false otherwise
35pub fn is_eof(reader: &mut dyn Read) -> bool {
36    let mut buf = [0; 1];
37    matches!(reader.read(&mut buf), Ok(0))
38}
39
40/// Reads a byte from the given stream and checks if there are more bytes to continue reading.
41///
42/// The return value includes two parts: an unsigned integer formed by the first 7 bits of the byte,
43/// and a boolean value indicating whether more bytes need to be read.
44///
45/// # Parameters
46/// * `stream`: The stream from which the byte is read.
47///
48/// # Returns
49/// Returns an `io::Result` containing a tuple. The first element is the value of the first 7 bits,
50/// and the second element is a boolean indicating whether more bytes need to be read.
51///
52pub fn read_byte_and_check_continuation<R: Read>(stream: &mut R) -> io::Result<(u8, bool)> {
53    // Create a buffer for a single byte
54    let mut bytes = [0; 1];
55
56    // Read exactly one byte from the stream into the buffer
57    stream.read_exact(&mut bytes)?;
58
59    // Extract the byte from the buffer
60    let byte = bytes[0];
61
62    // Extract the first 7 bits of the byte
63    let value = byte & 0b0111_1111;
64
65    // Check if the most significant bit (8th bit) is set, indicating more bytes to follow
66    let msb = byte >= 128;
67
68    // Return the extracted value and the continuation flag
69    Ok((value, msb))
70}
71
72/// Reads bytes from the stream and parses the first byte for type and size.
73/// Subsequent bytes are read as size bytes and are processed as variable-length
74/// integer in little-endian order. The function returns the type and the computed size.
75///
76/// # Parameters
77/// * `stream`: The stream from which the bytes are read.
78/// * `offset`: The offset of the stream.
79///
80/// # Returns
81/// Returns an `io::Result` containing a tuple of the type and the computed size.
82///
83pub fn read_type_and_varint_size<R: Read>(
84    stream: &mut R,
85    offset: &mut usize,
86) -> io::Result<(u8, usize)> {
87    let (first_byte, continuation) = read_byte_and_check_continuation(stream)?;
88
89    // Increment the offset by one byte
90    *offset += 1;
91
92    // Extract the type (bits 2, 3, 4 of the first byte)
93    let type_bits = (first_byte & 0b0111_0000) >> 4;
94
95    // Initialize size with the last 4 bits of the first byte
96    let mut size: u64 = (first_byte & 0b0000_1111) as u64;
97    let mut shift = 4; // Next byte will shift by 4 bits
98
99    let mut more_bytes = continuation;
100    while more_bytes {
101        let (next_byte, continuation) = read_byte_and_check_continuation(stream)?;
102        // Increment the offset by one byte
103        *offset += 1;
104
105        size |= (next_byte as u64) << shift;
106        shift += 7; // Each subsequent byte contributes 7 more bits
107        more_bytes = continuation;
108    }
109
110    Ok((type_bits, size as usize))
111}
112
113/// Reads a variable-length integer (VarInt) encoded in little-endian format from a source implementing the Read trait.
114///
115/// The VarInt encoding uses the most significant bit (MSB) of each byte as a continuation bit.
116/// The continuation bit being 1 indicates that there are following bytes.
117/// The actual integer value is encoded in the remaining 7 bits of each byte.
118///
119/// # Parameters
120/// * `reader`: A source implementing the Read trait (e.g., file, network stream).
121///
122/// # Returns
123/// Returns a `Result` containing either:
124/// * A tuple of the decoded `u64` value and the number of bytes read (`offset`).
125/// * An `io::Error` in case of any reading error or if the VarInt is too long.
126///
127pub fn read_varint_le<R: Read>(reader: &mut R) -> io::Result<(u64, usize)> {
128    // The decoded value
129    let mut value: u64 = 0;
130    // Bit shift for the next byte
131    let mut shift = 0;
132    // Number of bytes read
133    let mut offset = 0;
134
135    loop {
136        // A buffer to read a single byte
137        let mut buf = [0; 1];
138        // Read one byte from the reader
139        reader.read_exact(&mut buf)?;
140
141        // The byte just read
142        let byte = buf[0];
143        if shift > 63 {
144            // VarInt too long for u64
145            return Err(io::Error::new(
146                io::ErrorKind::InvalidData,
147                "VarInt too long",
148            ));
149        }
150
151        // Take the lower 7 bits of the byte
152        let byte_value = (byte & 0x7F) as u64;
153        // Add the byte value to the result, considering the shift
154        value |= byte_value << shift;
155
156        // Increment the byte count
157        offset += 1;
158        // Check if the MSB is 0 (last byte)
159        if byte & 0x80 == 0 {
160            break;
161        }
162
163        // Increment the shift for the next byte
164        shift += 7;
165    }
166
167    Ok((value, offset))
168}
169
170/// The offset for an OffsetDelta object(big-endian order)
171/// # Arguments
172///
173/// * `stream`: Input Data Stream to read
174/// # Returns
175/// * (`delta_offset`(unsigned), `consume`)
176pub fn read_offset_encoding<R: Read>(stream: &mut R) -> io::Result<(u64, usize)> {
177    // Like the object length, the offset for an OffsetDelta object
178    // is stored in a variable number of bytes,
179    // with the most significant bit of each byte indicating whether more bytes follow.
180    // However, the object length encoding allows redundant values,
181    // e.g. the 7-bit value [n] is the same as the 14- or 21-bit values [n, 0] or [n, 0, 0].
182    // Instead, the offset encoding adds 1 to the value of each byte except the least significant one.
183    // And just for kicks, the bytes are ordered from *most* to *least* significant.
184    let mut value = 0;
185    let mut offset = 0;
186    loop {
187        let (byte_value, more_bytes) = read_byte_and_check_continuation(stream)?;
188        offset += 1;
189        value = (value << 7) | byte_value as u64;
190        if !more_bytes {
191            return Ok((value, offset));
192        }
193
194        value += 1; //important!: for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1)) to the result
195    }
196}
197
198/// Read the next N bytes from the reader
199///
200#[inline]
201pub fn read_bytes<R: Read, const N: usize>(stream: &mut R) -> io::Result<[u8; N]> {
202    let mut bytes = [0; N];
203    stream.read_exact(&mut bytes)?;
204
205    Ok(bytes)
206}
207
208/// Reads a partial integer from a stream. (little-endian order)
209///
210/// # Arguments
211///
212/// * `stream` - A mutable reference to a readable stream.
213/// * `bytes` - The number of bytes to read from the stream.
214/// * `present_bytes` - A mutable reference to a byte indicating which bits are present in the integer value.
215///
216/// # Returns
217///
218/// This function returns a result of type `io::Result<usize>`. If the operation is successful, the integer value
219/// read from the stream is returned as `Ok(value)`. Otherwise, an `Err` variant is returned, wrapping an `io::Error`
220/// that describes the specific error that occurred.
221pub fn read_partial_int<R: Read>(
222    stream: &mut R,
223    bytes: u8,
224    present_bytes: &mut u8,
225) -> io::Result<usize> {
226    let mut value: usize = 0;
227
228    // Iterate over the byte indices
229    for byte_index in 0..bytes {
230        // Check if the current bit is present
231        if *present_bytes & 1 != 0 {
232            // Read a byte from the stream
233            let [byte] = read_bytes(stream)?;
234
235            // Add the byte value to the integer value
236            value |= (byte as usize) << (byte_index * 8);
237        }
238
239        // Shift the present bytes to the right
240        *present_bytes >>= 1;
241    }
242
243    Ok(value)
244}
245
246/// Reads the base size and result size of a delta object from the given stream.
247///
248/// **Note**: The stream MUST be positioned at the start of the delta object.
249///
250/// The base size and result size are encoded as variable-length integers in little-endian order.
251///
252/// The base size is the size of the base object, and the result size is the size of the result object.
253///
254/// # Parameters
255/// * `stream`: The stream from which the sizes are read.
256///
257/// # Returns
258/// Returns a tuple containing the base size and result size.
259///
260pub fn read_delta_object_size<R: Read>(stream: &mut R) -> io::Result<(usize, usize)> {
261    let base_size = read_varint_le(stream)?.0 as usize;
262    let result_size = read_varint_le(stream)?.0 as usize;
263    Ok((base_size, result_size))
264}
265
266/// Calculate the SHA1 hash of the given object.
267/// <br> "`<type> <size>\0<content>`"
268/// <br> data: The decompressed content of the object
269pub fn calculate_object_hash(obj_type: ObjectType, data: &Vec<u8>) -> ObjectHash {
270    let type_bytes = obj_type
271        .to_bytes()
272        .expect("calculate_object_hash called with a delta type that has no loose-object header");
273    match get_hash_kind() {
274        crate::hash::HashKind::Sha1 => {
275            let mut hash = Sha1::new();
276            // Header: "<type> <size>\0"
277            hash.update(type_bytes);
278            hash.update(b" ");
279            hash.update(data.len().to_string());
280            hash.update(b"\0");
281
282            // Decompressed data(raw content)
283            hash.update(data);
284
285            let re: [u8; 20] = hash.finalize().into();
286            ObjectHash::Sha1(re)
287        }
288        crate::hash::HashKind::Sha256 => {
289            let mut hash = sha2::Sha256::new();
290            // Header: "<type> <size>\0"
291            hash.update(type_bytes);
292            hash.update(b" ");
293            hash.update(data.len().to_string());
294            hash.update(b"\0");
295
296            // Decompressed data(raw content)
297            hash.update(data);
298            let re: [u8; 32] = hash.finalize().into();
299            ObjectHash::Sha256(re)
300        }
301    }
302}
303/// Create an empty directory or clear the existing directory.
304pub fn create_empty_dir<P: AsRef<Path>>(path: P) -> io::Result<()> {
305    let dir = path.as_ref();
306    // 删除整个文件夹
307    if dir.exists() {
308        fs::remove_dir_all(dir)?;
309    }
310    // 重新创建文件夹
311    fs::create_dir_all(dir)?;
312    Ok(())
313}
314
315/// Count the number of files in a directory and its subdirectories.
316pub fn count_dir_files(path: &Path) -> io::Result<usize> {
317    let mut count = 0;
318    for entry in fs::read_dir(path)? {
319        let entry = entry?;
320        let path = entry.path();
321        if path.is_dir() {
322            count += count_dir_files(&path)?;
323        } else {
324            count += 1;
325        }
326    }
327    Ok(count)
328}
329
330/// Count the time taken to execute a block of code.
331#[macro_export]
332macro_rules! time_it {
333    ($msg:expr, $block:block) => {{
334        let start = std::time::Instant::now();
335        let result = $block;
336        let elapsed = start.elapsed();
337        // println!("{}: {:?}", $msg, elapsed);
338        tracing::info!("{}: {:?}", $msg, elapsed);
339        result
340    }};
341}
342
343#[cfg(test)]
344mod tests {
345    use std::{
346        io,
347        io::{Cursor, Read},
348    };
349
350    use crate::{
351        hash::{HashKind, set_hash_kind_for_test},
352        internal::{object::types::ObjectType, pack::utils::*},
353    };
354
355    #[test]
356    fn test_calc_obj_hash() {
357        let _guard = set_hash_kind_for_test(HashKind::Sha1);
358        let hash = calculate_object_hash(ObjectType::Blob, &b"a".to_vec());
359        assert_eq!(hash.to_string(), "2e65efe2a145dda7ee51d1741299f848e5bf752e");
360    }
361    #[test]
362    fn test_calc_obj_hash_sha256() {
363        let _guard = set_hash_kind_for_test(HashKind::Sha256);
364        let hash = calculate_object_hash(ObjectType::Blob, &b"a".to_vec());
365        assert_eq!(
366            hash.to_string(),
367            "eb337bcee2061c5313c9a1392116b6c76039e9e30d71467ae359b36277e17dc7"
368        );
369    }
370
371    #[test]
372    fn eof() {
373        let mut reader = Cursor::new(&b""[..]);
374        assert!(is_eof(&mut reader));
375    }
376
377    #[test]
378    fn not_eof() {
379        let mut reader = Cursor::new(&b"abc"[..]);
380        assert!(!is_eof(&mut reader));
381    }
382
383    #[test]
384    fn eof_midway() {
385        let mut reader = Cursor::new(&b"abc"[..]);
386        reader.read_exact(&mut [0; 2]).unwrap();
387        assert!(!is_eof(&mut reader));
388    }
389
390    #[test]
391    fn reader_error() {
392        struct BrokenReader;
393        impl Read for BrokenReader {
394            fn read(&mut self, _: &mut [u8]) -> io::Result<usize> {
395                Err(io::Error::other("error"))
396            }
397        }
398
399        let mut reader = BrokenReader;
400        assert!(!is_eof(&mut reader));
401    }
402
403    ///  Test case for a byte without a continuation bit (most significant bit is 0)
404    #[test]
405    fn test_read_byte_and_check_continuation_no_continuation() {
406        let data = [0b0101_0101]; // 85 in binary, highest bit is 0
407        let mut cursor = Cursor::new(data);
408        let (value, more_bytes) = read_byte_and_check_continuation(&mut cursor).unwrap();
409
410        assert_eq!(value, 85); // Expected value is 85
411        assert!(!more_bytes); // No more bytes are expected
412    }
413
414    /// Test case for a byte with a continuation bit (most significant bit is 1)
415    #[test]
416    fn test_read_byte_and_check_continuation_with_continuation() {
417        let data = [0b1010_1010]; // 170 in binary, highest bit is 1
418        let mut cursor = Cursor::new(data);
419        let (value, more_bytes) = read_byte_and_check_continuation(&mut cursor).unwrap();
420
421        assert_eq!(value, 42); // Expected value is 42 (170 - 128)
422        assert!(more_bytes); // More bytes are expected
423    }
424
425    /// Test cases for edge values, like the minimum and maximum byte values
426    #[test]
427    fn test_read_byte_and_check_continuation_edge_cases() {
428        // Test the minimum value (0)
429        let data = [0b0000_0000];
430        let mut cursor = Cursor::new(data);
431        let (value, more_bytes) = read_byte_and_check_continuation(&mut cursor).unwrap();
432
433        assert_eq!(value, 0); // Expected value is 0
434        assert!(!more_bytes); // No more bytes are expected
435
436        // Test the maximum value (255)
437        let data = [0b1111_1111];
438        let mut cursor = Cursor::new(data);
439        let (value, more_bytes) = read_byte_and_check_continuation(&mut cursor).unwrap();
440
441        assert_eq!(value, 127); // Expected value is 127 (255 - 128)
442        assert!(more_bytes); // More bytes are expected
443    }
444
445    /// Test with a single byte where msb is 0 (no continuation)
446    #[test]
447    fn test_single_byte_no_continuation() {
448        let data = [0b0101_0101]; // Type: 5 (101), Size: 5 (0101)
449        let mut offset: usize = 0;
450        let mut cursor = Cursor::new(data);
451        let (type_bits, size) = read_type_and_varint_size(&mut cursor, &mut offset).unwrap();
452
453        assert_eq!(offset, 1); // Offset is 1
454        assert_eq!(type_bits, 5); // Expected type is 2
455        assert_eq!(size, 5); // Expected size is 5
456    }
457
458    /// Test with multiple bytes, where continuation occurs
459    #[test]
460    fn test_multiple_bytes_with_continuation() {
461        // Type: 5 (101), Sizes: 5 (0101), 3 (0000011) in little-endian order
462        let data = [0b1101_0101, 0b0000_0011]; // Second byte's msb is 0
463        let mut offset: usize = 0;
464        let mut cursor = Cursor::new(data);
465        let (type_bits, size) = read_type_and_varint_size(&mut cursor, &mut offset).unwrap();
466
467        assert_eq!(offset, 2); // Offset is 2
468        assert_eq!(type_bits, 5); // Expected type is 5
469        // Expected size 000000110101
470        // 110101  = 1 * 2^5 + 1 * 2^4 + 0 * 2^3 + 1 * 2^2 + 0 * 2^1 + 1 * 2^0= 53
471        assert_eq!(size, 53);
472    }
473
474    /// Test with edge case where size is spread across multiple bytes
475    #[test]
476    fn test_edge_case_size_spread_across_bytes() {
477        // Type: 1 (001), Sizes: 15 (1111) in little-endian order
478        let data = [0b0001_1111, 0b0000_0010]; // Second byte's msb is 1 (continuation)
479        let mut offset: usize = 0;
480        let mut cursor = Cursor::new(data);
481        let (type_bits, size) = read_type_and_varint_size(&mut cursor, &mut offset).unwrap();
482
483        assert_eq!(offset, 1); // Offset is 1
484        assert_eq!(type_bits, 1); // Expected type is 1
485        // Expected size is 15
486        assert_eq!(size, 15);
487    }
488
489    /// Test reading VarInt encoded in little-endian format from a stream.
490    #[test]
491    fn test_read_varint_le_single_byte() {
492        // Single byte: 0x05 (binary: 0000 0101)
493        // Represents the value 5 with no continuation bit set.
494        let data = vec![0x05];
495        let mut cursor = Cursor::new(data);
496        let (value, offset) = read_varint_le(&mut cursor).unwrap();
497
498        assert_eq!(value, 5);
499        assert_eq!(offset, 1);
500    }
501
502    /// Test reading VarInt encoded in little-endian format with multiple bytes from a stream.
503    #[test]
504    fn test_read_varint_le_multiple_bytes() {
505        // Two bytes: 0x85, 0x01 (binary: 1000 0101, 0000 0001)
506        // Represents the value 133. First byte has the continuation bit set.
507        let data = vec![0x85, 0x01];
508        let mut cursor = Cursor::new(data);
509        let (value, offset) = read_varint_le(&mut cursor).unwrap();
510
511        assert_eq!(value, 133);
512        assert_eq!(offset, 2);
513    }
514
515    /// Test reading VarInt encoded in little-endian format with multiple bytes from a stream.
516    #[test]
517    fn test_read_varint_le_large_number() {
518        // Five bytes: 0xFF, 0xFF, 0xFF, 0xFF, 0xF (binary: 1111 1111, 1111 1111, 1111 1111, 1111 1111, 0000 1111)
519        // Represents the value 134,217,727. All continuation bits are set except in the last byte.
520        let data = vec![0xFF, 0xFF, 0xFF, 0xFF, 0xF];
521        let mut cursor = Cursor::new(data);
522        let (value, offset) = read_varint_le(&mut cursor).unwrap();
523
524        assert_eq!(value, 0xFFFFFFFF);
525        assert_eq!(offset, 5);
526    }
527
528    /// Test reading VarInt encoded in little-endian format with zero value.
529    #[test]
530    fn test_read_varint_le_zero() {
531        // Single byte: 0x00 (binary: 0000 0000)
532        // Represents the value 0 with no continuation bit set.
533        let data = vec![0x00];
534        let mut cursor = Cursor::new(data);
535        let (value, offset) = read_varint_le(&mut cursor).unwrap();
536
537        assert_eq!(value, 0);
538        assert_eq!(offset, 1);
539    }
540
541    /// Test reading VarInt encoded in little-endian format that is too long.
542    #[test]
543    fn test_read_varint_le_too_long() {
544        let data = vec![
545            0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01,
546        ];
547        let mut cursor = Cursor::new(data);
548        let result = read_varint_le(&mut cursor);
549
550        assert!(result.is_err());
551    }
552
553    /// Test reading offset encoding for an OffsetDelta object.
554    #[test]
555    fn test_read_offset_encoding() {
556        let data: Vec<u8> = vec![0b_1101_0101, 0b_0000_0101];
557        let mut cursor = Cursor::new(data);
558        let result = read_offset_encoding(&mut cursor);
559        assert!(result.is_ok());
560        assert_eq!(result.unwrap(), (11013, 2));
561    }
562}