tact_parser/
utils.rs

1//! Utility functions for binary operations used in TACT file formats
2
3use crate::jenkins3::hashlittle2;
4use crate::{Error, Result};
5
6/// Perform a [`HashPath`][0] with [`hashlittle2`][] (aka: jenkins3).
7///
8/// This normalises `path` using the same rules as [`SStrHash`][1], and then
9/// merges the two `u32`s of [`hashlittle2`][] into a `u64`, with `pc` as the
10/// high bytes.
11///
12/// [0]: https://wowdev.wiki/TACT#hashpath
13/// [1]: https://wowdev.wiki/SStrHash
14pub fn jenkins3_hashpath(path: &str) -> u64 {
15    let normalised = path.to_ascii_uppercase().replace('/', "\\");
16    let mut pc = 0;
17    let mut pb = 0;
18    hashlittle2(normalised.as_bytes(), &mut pc, &mut pb);
19
20    (u64::from(pc) << 32) | u64::from(pb)
21}
22
23/// Read a 40-bit (5-byte) unsigned integer from a byte slice (little-endian)
24///
25/// 40-bit integers are used throughout TACT formats for file sizes and offsets.
26/// They allow representing values up to 1TB while saving space compared to 64-bit.
27///
28/// # Arguments
29/// * `data` - Byte slice containing at least 5 bytes
30///
31/// # Returns
32/// * The 40-bit value as a u64
33///
34/// # Errors
35/// * Returns error if data contains less than 5 bytes
36///
37/// # Example
38/// ```
39/// use tact_parser::utils::read_uint40;
40///
41/// let data = [0x12, 0x34, 0x56, 0x78, 0x9A];
42/// let value = read_uint40(&data).unwrap();
43/// assert_eq!(value, 0x9A78563412);
44/// ```
45pub fn read_uint40(data: &[u8]) -> Result<u64> {
46    if data.len() < 5 {
47        return Err(Error::IOError(std::io::Error::new(
48            std::io::ErrorKind::UnexpectedEof,
49            format!("Need 5 bytes for uint40, got {}", data.len()),
50        )));
51    }
52
53    Ok((data[0] as u64)
54        | ((data[1] as u64) << 8)
55        | ((data[2] as u64) << 16)
56        | ((data[3] as u64) << 24)
57        | ((data[4] as u64) << 32))
58}
59
/// Encode a 40-bit (5-byte) unsigned integer as a little-endian byte array
///
/// # Arguments
/// * `value` - The value to encode (must fit in 40 bits)
///
/// # Returns
/// * A 5-byte array holding `value`, least significant byte first
///
/// # Panics
/// * Panics if `value` does not fit in 40 bits (>= 2^40)
///
/// # Example
/// ```
/// use tact_parser::utils::write_uint40;
///
/// let bytes = write_uint40(0x9A78563412);
/// assert_eq!(bytes, [0x12, 0x34, 0x56, 0x78, 0x9A]);
/// ```
pub fn write_uint40(value: u64) -> [u8; 5] {
    // Any bit at position 40 or above means the value cannot be represented.
    assert!(
        (value >> 40) == 0,
        "Value {value:#x} exceeds 40-bit range"
    );

    // The five least significant bytes come first in little-endian order; the
    // remaining three are guaranteed to be zero by the assertion above.
    let [b0, b1, b2, b3, b4, ..] = value.to_le_bytes();
    [b0, b1, b2, b3, b4]
}
92
93/// Read a 40-bit unsigned integer from a cursor (little-endian)
94///
95/// This is a convenience function for use with `std::io::Cursor` or `BufReader`.
96///
97/// # Arguments
98/// * `reader` - A reader implementing `std::io::Read`
99///
100/// # Returns
101/// * The 40-bit value as a u64
102///
103/// # Errors
104/// * Returns error if unable to read 5 bytes
105pub fn read_uint40_from<R: std::io::Read>(reader: &mut R) -> Result<u64> {
106    let mut buf = [0u8; 5];
107    reader.read_exact(&mut buf)?;
108    read_uint40(&buf)
109}
110
111/// Read a variable-length integer from a byte slice
112///
113/// Variable-length integers use 7 bits per byte with a continuation bit.
114/// This is compatible with protobuf/varint encoding.
115///
116/// # Arguments
117/// * `data` - Byte slice to read from
118///
119/// # Returns
120/// * Tuple of (value, bytes_consumed)
121///
122/// # Errors
123/// * Returns error if varint is malformed or exceeds 5 bytes
124///
125/// # Example
126/// ```
127/// use tact_parser::utils::read_varint;
128///
129/// let data = [0x08]; // Value 8
130/// let (value, consumed) = read_varint(&data).unwrap();
131/// assert_eq!(value, 8);
132/// assert_eq!(consumed, 1);
133///
134/// let data = [0x96, 0x01]; // Value 150
135/// let (value, consumed) = read_varint(&data).unwrap();
136/// assert_eq!(value, 150);
137/// assert_eq!(consumed, 2);
138/// ```
139pub fn read_varint(data: &[u8]) -> Result<(u32, usize)> {
140    let mut result = 0u32;
141    let mut shift = 0;
142    let mut consumed = 0;
143
144    for &byte in data {
145        consumed += 1;
146
147        // Extract 7-bit value
148        let value = (byte & 0x7F) as u32;
149
150        // Check for overflow
151        if shift >= 32 || (shift == 28 && value > 0x0F) {
152            return Err(Error::IOError(std::io::Error::new(
153                std::io::ErrorKind::InvalidData,
154                "Varint exceeds 32-bit range",
155            )));
156        }
157
158        result |= value << shift;
159
160        // Check continuation bit
161        if byte & 0x80 == 0 {
162            return Ok((result, consumed));
163        }
164
165        shift += 7;
166
167        // Varints shouldn't exceed 5 bytes for 32-bit values
168        if consumed >= 5 {
169            return Err(Error::IOError(std::io::Error::new(
170                std::io::ErrorKind::InvalidData,
171                "Varint exceeds maximum length",
172            )));
173        }
174    }
175
176    Err(Error::IOError(std::io::Error::new(
177        std::io::ErrorKind::UnexpectedEof,
178        "Incomplete varint",
179    )))
180}
181
/// Encode a 32-bit value as a variable-length integer
///
/// The value is emitted in groups of 7 bits, least significant group first.
/// Every byte except the last has its top bit set to signal that more bytes
/// follow.
///
/// # Arguments
/// * `value` - The value to encode
///
/// # Returns
/// * A vector containing the encoded varint (1 to 5 bytes)
///
/// # Example
/// ```
/// use tact_parser::utils::write_varint;
///
/// let encoded = write_varint(8);
/// assert_eq!(encoded, vec![0x08]);
///
/// let encoded = write_varint(150);
/// assert_eq!(encoded, vec![0x96, 0x01]);
/// ```
pub fn write_varint(mut value: u32) -> Vec<u8> {
    // A u32 never needs more than five 7-bit groups.
    let mut encoded = Vec::with_capacity(5);

    // While more than 7 bits remain, emit a group with the continuation bit.
    while value >= 0x80 {
        encoded.push(((value & 0x7F) as u8) | 0x80);
        value >>= 7;
    }

    // The final group fits in 7 bits, so the continuation bit stays clear.
    encoded.push(value as u8);
    encoded
}
218
219/// Read a null-terminated C string from a byte slice
220///
221/// # Arguments
222/// * `data` - Byte slice to read from
223///
224/// # Returns
225/// * Tuple of (string, bytes_consumed)
226///
227/// # Errors
228/// * Returns error if no null terminator found or invalid UTF-8
229pub fn read_cstring(data: &[u8]) -> Result<(String, usize)> {
230    // Find null terminator
231    let null_pos = data.iter().position(|&b| b == 0).ok_or_else(|| {
232        Error::IOError(std::io::Error::new(
233            std::io::ErrorKind::InvalidData,
234            "No null terminator found in C string",
235        ))
236    })?;
237
238    // Convert to string
239    let string = std::str::from_utf8(&data[..null_pos])
240        .map_err(|e| {
241            Error::IOError(std::io::Error::new(
242                std::io::ErrorKind::InvalidData,
243                format!("Invalid UTF-8 in C string: {e}"),
244            ))
245        })?
246        .to_string();
247
248    Ok((string, null_pos + 1)) // +1 for null terminator
249}
250
251/// Read a 40-bit (5-byte) unsigned integer from a byte slice (big-endian)
252///
253/// 40-bit integers are used throughout TACT formats for file sizes and offsets.
254/// They allow representing values up to 1TB while saving space compared to 64-bit.
255///
256/// # Arguments
257/// * `data` - Byte slice containing at least 5 bytes
258///
259/// # Returns
260/// * The 40-bit value as a u64
261///
262/// # Errors
263/// * Returns error if data contains less than 5 bytes
264///
265/// # Example
266/// ```
267/// use tact_parser::utils::read_uint40_be;
268///
269/// let data = [0x01, 0x00, 0x00, 0x00, 0x00]; // 4GB file
270/// let value = read_uint40_be(&data).unwrap();
271/// assert_eq!(value, 0x100000000);
272/// ```
273pub fn read_uint40_be(data: &[u8]) -> Result<u64> {
274    if data.len() < 5 {
275        return Err(Error::IOError(std::io::Error::new(
276            std::io::ErrorKind::UnexpectedEof,
277            format!("Need 5 bytes for uint40, got {}", data.len()),
278        )));
279    }
280
281    // TACT encoding format: 1 byte for high bits (32-39) + 4 bytes big-endian u32 (0-31)
282    let high_byte = data[0] as u64;
283    let low_u32 = u32::from_be_bytes([data[1], data[2], data[3], data[4]]) as u64;
284
285    Ok((high_byte << 32) | low_u32)
286}
287
/// Encode a 40-bit (5-byte) unsigned integer as a big-endian byte array
///
/// # Arguments
/// * `value` - The value to encode (must fit in 40 bits)
///
/// # Returns
/// * A 5-byte array holding `value`, most significant byte first
///
/// # Panics
/// * Panics if `value` does not fit in 40 bits (>= 2^40)
///
/// # Example
/// ```
/// use tact_parser::utils::write_uint40_be;
///
/// let bytes = write_uint40_be(0x100000000); // 4GB
/// assert_eq!(bytes, [0x01, 0x00, 0x00, 0x00, 0x00]);
/// ```
pub fn write_uint40_be(value: u64) -> [u8; 5] {
    // Any bit at position 40 or above means the value cannot be represented.
    assert!(
        (value >> 40) == 0,
        "Value {value:#x} exceeds 40-bit range"
    );

    // TACT layout: one byte for bits 32-39 followed by a big-endian u32 for
    // bits 0-31, i.e. the last five bytes of the big-endian u64. The three
    // leading bytes are guaranteed to be zero by the assertion above.
    let [_, _, _, high, low3, low2, low1, low0] = value.to_be_bytes();
    [high, low3, low2, low1, low0]
}
325
326/// Read a 40-bit unsigned integer from a cursor (big-endian)
327///
328/// This is a convenience function for use with `std::io::Cursor` or `BufReader`.
329///
330/// # Arguments
331/// * `reader` - A reader implementing `std::io::Read`
332///
333/// # Returns
334/// * The 40-bit value as a u64
335///
336/// # Errors
337/// * Returns error if unable to read 5 bytes
338pub fn read_uint40_be_from<R: std::io::Read>(reader: &mut R) -> Result<u64> {
339    let mut buf = [0u8; 5];
340    reader.read_exact(&mut buf)?;
341    read_uint40_be(&buf)
342}
343
344/// Read a C string from a reader
345///
346/// # Arguments
347/// * `reader` - A reader implementing `std::io::Read`
348///
349/// # Returns
350/// * The string without null terminator
351///
352/// # Errors
353/// * Returns error if unable to read or invalid UTF-8
354pub fn read_cstring_from<R: std::io::Read>(reader: &mut R) -> Result<String> {
355    let mut bytes = Vec::new();
356    let mut byte = [0u8; 1];
357
358    loop {
359        reader.read_exact(&mut byte)?;
360        if byte[0] == 0 {
361            break;
362        }
363        bytes.push(byte[0]);
364    }
365
366    String::from_utf8(bytes).map_err(|e| {
367        Error::IOError(std::io::Error::new(
368            std::io::ErrorKind::InvalidData,
369            format!("Invalid UTF-8 in C string: {e}"),
370        ))
371    })
372}
373
#[cfg(test)]
mod tests {
    //! Unit tests for the binary helpers: 40-bit integers (both byte orders,
    //! slice and reader variants), varints and C strings.

    use super::*;
    use std::io::Cursor;

    /// Representative 40-bit values, including both boundaries.
    const UINT40_SAMPLES: [u64; 9] = [
        0,
        1,
        255,
        256,
        65535,
        65536,
        0xFFFFFFFF,
        0x123456789A,
        0xFFFFFFFFFF, // Max 40-bit value
    ];

    /// Little-endian write followed by read must reproduce the input.
    #[test]
    fn test_uint40_roundtrip() {
        for value in UINT40_SAMPLES {
            let bytes = write_uint40(value);
            let decoded = read_uint40(&bytes).unwrap();
            assert_eq!(value, decoded, "Failed for value {value:#x}");
        }
    }

    /// The least significant byte comes first in the little-endian layout.
    #[test]
    fn test_uint40_little_endian() {
        let data = [0x12, 0x34, 0x56, 0x78, 0x9A];
        let value = read_uint40(&data).unwrap();
        assert_eq!(value, 0x9A78563412);

        let bytes = write_uint40(0x9A78563412);
        assert_eq!(bytes, [0x12, 0x34, 0x56, 0x78, 0x9A]);
    }

    /// Values of 2^40 and above cannot be encoded.
    #[test]
    #[should_panic(expected = "exceeds 40-bit range")]
    fn test_uint40_overflow() {
        write_uint40(0x10000000000); // 2^40
    }

    /// Fewer than five bytes is an error, not a panic.
    #[test]
    fn test_uint40_insufficient_data() {
        let data = [0x12, 0x34, 0x56, 0x78]; // Only 4 bytes
        assert!(read_uint40(&data).is_err());
    }

    /// The reader variant consumes exactly five bytes and fails on short input.
    #[test]
    fn test_uint40_from_reader() {
        let data: [u8; 6] = [0x12, 0x34, 0x56, 0x78, 0x9A, 0xFF];
        let mut cursor = Cursor::new(&data);
        let value = read_uint40_from(&mut cursor).unwrap();
        assert_eq!(value, 0x9A78563412);
        assert_eq!(cursor.position(), 5);

        // Only one byte is left, so a second read cannot succeed.
        assert!(read_uint40_from(&mut cursor).is_err());
    }

    #[test]
    fn test_varint_single_byte() {
        let data = [0x08];
        let (value, consumed) = read_varint(&data).unwrap();
        assert_eq!(value, 8);
        assert_eq!(consumed, 1);

        let encoded = write_varint(8);
        assert_eq!(encoded, vec![0x08]);
    }

    #[test]
    fn test_varint_multi_byte() {
        let data = [0x96, 0x01]; // 150 = 0x96
        let (value, consumed) = read_varint(&data).unwrap();
        assert_eq!(value, 150);
        assert_eq!(consumed, 2);

        let encoded = write_varint(150);
        assert_eq!(encoded, vec![0x96, 0x01]);
    }

    /// `u32::MAX` needs all five bytes and must survive a round-trip.
    #[test]
    fn test_varint_max_value() {
        let value = 0xFFFFFFFF;
        let encoded = write_varint(value);
        assert_eq!(encoded, vec![0xFF, 0xFF, 0xFF, 0xFF, 0x0F]);

        let (decoded, consumed) = read_varint(&encoded).unwrap();
        assert_eq!(decoded, value);
        assert_eq!(consumed, 5);
    }

    #[test]
    fn test_varint_known_values() {
        // Test cases from protobuf spec
        let test_cases = [
            (0, vec![0x00]),
            (1, vec![0x01]),
            (127, vec![0x7F]),
            (128, vec![0x80, 0x01]),
            (300, vec![0xAC, 0x02]),
            (16384, vec![0x80, 0x80, 0x01]),
        ];

        for (value, expected) in test_cases {
            let encoded = write_varint(value);
            assert_eq!(encoded, expected, "Encoding failed for {value}");

            let (decoded, consumed) = read_varint(&expected).unwrap();
            assert_eq!(decoded, value, "Decoding failed for {value}");
            assert_eq!(consumed, expected.len());
        }
    }

    /// Bytes after the terminating byte must not be consumed.
    #[test]
    fn test_varint_trailing_data() {
        let data: [u8; 3] = [0x08, 0xFF, 0xFF];
        let (value, consumed) = read_varint(&data).unwrap();
        assert_eq!(value, 8);
        assert_eq!(consumed, 1);
    }

    /// Input that ends while the continuation bit is set is reported as EOF.
    #[test]
    fn test_varint_incomplete() {
        let inputs: [&[u8]; 3] = [&[], &[0x80], &[0xFF, 0xFF, 0xFF, 0xFF]];
        for input in inputs {
            assert!(
                matches!(
                    read_varint(input),
                    Err(Error::IOError(e)) if e.kind() == std::io::ErrorKind::UnexpectedEof
                ),
                "Expected UnexpectedEof for {input:?}"
            );
        }
    }

    /// A fifth byte carrying more than 4 payload bits does not fit in a u32.
    #[test]
    fn test_varint_overflow() {
        let data: [u8; 5] = [0xFF, 0xFF, 0xFF, 0xFF, 0x1F];
        assert!(matches!(
            read_varint(&data),
            Err(Error::IOError(e)) if e.kind() == std::io::ErrorKind::InvalidData
        ));
    }

    /// A continuation bit on the fifth byte means the varint is too long.
    #[test]
    fn test_varint_too_long() {
        let data: [u8; 6] = [0x80, 0x80, 0x80, 0x80, 0x80, 0x01];
        assert!(matches!(
            read_varint(&data),
            Err(Error::IOError(e)) if e.kind() == std::io::ErrorKind::InvalidData
        ));
    }

    #[test]
    fn test_cstring() {
        let data = b"Hello, World!\0extra data";
        let (string, consumed) = read_cstring(data).unwrap();
        assert_eq!(string, "Hello, World!");
        assert_eq!(consumed, 14); // Including null terminator
    }

    #[test]
    fn test_cstring_empty() {
        let data = b"\0";
        let (string, consumed) = read_cstring(data).unwrap();
        assert_eq!(string, "");
        assert_eq!(consumed, 1);
    }

    #[test]
    fn test_cstring_no_terminator() {
        let data = b"No null here";
        assert!(read_cstring(data).is_err());
    }

    /// Bytes that are not valid UTF-8 are rejected even when terminated.
    #[test]
    fn test_cstring_invalid_utf8() {
        let data: [u8; 3] = [0xFF, 0xFE, 0x00];
        assert!(read_cstring(&data).is_err());
    }

    /// The reader variant consumes the terminator and stops right after it.
    #[test]
    fn test_cstring_from_reader() {
        let data: &[u8] = b"abc\0def";
        let mut cursor = Cursor::new(data);

        let string = read_cstring_from(&mut cursor).unwrap();
        assert_eq!(string, "abc");
        assert_eq!(cursor.position(), 4);

        // "def" has no terminator, so the reader runs dry first.
        assert!(read_cstring_from(&mut cursor).is_err());
    }

    /// Invalid UTF-8 read from a reader is reported as an error.
    #[test]
    fn test_cstring_from_reader_invalid_utf8() {
        let data: &[u8] = &[0xFF, 0x00];
        let mut cursor = Cursor::new(data);
        assert!(read_cstring_from(&mut cursor).is_err());
    }

    #[test]
    fn test_uint40_big_endian() {
        // Test TACT encoding format: 1 high byte + 4 bytes big-endian u32
        // Example: 4GB file (0x100000000)
        let data = [0x01, 0x00, 0x00, 0x00, 0x00];
        let value = read_uint40_be(&data).unwrap();
        assert_eq!(value, 0x100000000); // 4GB

        // Test a more complex value: 0x0A << 32 | 0x12345678
        let data = [0x0A, 0x12, 0x34, 0x56, 0x78];
        let value = read_uint40_be(&data).unwrap();
        assert_eq!(value, 0x0A12345678);

        // The writer must produce the same layout.
        let bytes = write_uint40_be(0x0A12345678);
        assert_eq!(bytes, [0x0A, 0x12, 0x34, 0x56, 0x78]);

        // Test round-trip
        let original = 0x0A12345678u64;
        let bytes = write_uint40_be(original);
        let restored = read_uint40_be(&bytes).unwrap();
        assert_eq!(original, restored);
    }

    /// Big-endian write followed by read must reproduce the input.
    #[test]
    fn test_uint40_be_roundtrip() {
        for value in UINT40_SAMPLES {
            let bytes = write_uint40_be(value);
            let decoded = read_uint40_be(&bytes).unwrap();
            assert_eq!(value, decoded, "Failed for value {value:#x}");
        }
    }

    /// Values of 2^40 and above cannot be encoded in big-endian form either.
    #[test]
    #[should_panic(expected = "exceeds 40-bit range")]
    fn test_uint40_be_overflow() {
        write_uint40_be(0x10000000000); // 2^40
    }

    /// Fewer than five bytes is an error, not a panic.
    #[test]
    fn test_uint40_be_insufficient_data() {
        let data = [0x12, 0x34, 0x56, 0x78]; // Only 4 bytes
        assert!(read_uint40_be(&data).is_err());
    }

    #[test]
    fn test_uint40_be_from_reader() {
        let data = [0x01, 0x00, 0x00, 0x00, 0x00]; // 4GB
        let mut cursor = Cursor::new(&data);
        let value = read_uint40_be_from(&mut cursor).unwrap();
        assert_eq!(value, 0x100000000);
    }

    /// A reader holding fewer than five bytes yields an error.
    #[test]
    fn test_uint40_be_from_reader_insufficient_data() {
        let data: [u8; 4] = [0x01, 0x00, 0x00, 0x00];
        let mut cursor = Cursor::new(&data);
        assert!(read_uint40_be_from(&mut cursor).is_err());
    }
}