tact_parser/
utils.rs

1//! Utility functions for binary operations used in TACT file formats
2
3use crate::jenkins3::hashlittle2;
4use crate::{Error, Result};
5
6/// Perform a [`HashPath`][0] with [`hashlittle2`][] (aka: jenkins3).
7///
8/// This normalises `path` using the same rules as [`SStrHash`][1], and then
9/// merges the two `u32`s of [`hashlittle2`][] into a `u64`, with `pc` as the
10/// high bytes.
11///
12/// [0]: https://wowdev.wiki/TACT#hashpath
13/// [1]: https://wowdev.wiki/SStrHash
14pub fn jenkins3_hashpath(path: &str) -> u64 {
15    let normalised = path.to_ascii_uppercase().replace('/', "\\");
16    let mut pc = 0;
17    let mut pb = 0;
18    hashlittle2(normalised.as_bytes(), &mut pc, &mut pb);
19
20    (u64::from(pc) << 32) | u64::from(pb)
21}
22
23/// Read a 40-bit (5-byte) unsigned integer from a byte slice (little-endian)
24///
25/// 40-bit integers are used throughout TACT formats for file sizes and offsets.
26/// They allow representing values up to 1TB while saving space compared to 64-bit.
27///
28/// # Arguments
29/// * `data` - Byte slice containing at least 5 bytes
30///
31/// # Returns
32/// * The 40-bit value as a u64
33///
34/// # Errors
35/// * Returns error if data contains less than 5 bytes
36///
37/// # Example
38/// ```
39/// use tact_parser::utils::read_uint40;
40///
41/// let data = [0x12, 0x34, 0x56, 0x78, 0x9A];
42/// let value = read_uint40(&data).unwrap();
43/// assert_eq!(value, 0x9A78563412);
44/// ```
45pub fn read_uint40(data: &[u8]) -> Result<u64> {
46    if data.len() < 5 {
47        return Err(Error::IOError(std::io::Error::new(
48            std::io::ErrorKind::UnexpectedEof,
49            format!("Need 5 bytes for uint40, got {}", data.len()),
50        )));
51    }
52
53    Ok((data[0] as u64)
54        | ((data[1] as u64) << 8)
55        | ((data[2] as u64) << 16)
56        | ((data[3] as u64) << 24)
57        | ((data[4] as u64) << 32))
58}
59
/// Encode a value as a little-endian 40-bit (5-byte) unsigned integer
///
/// # Arguments
/// * `value` - The value to encode; it must fit in 40 bits
///
/// # Returns
/// * The five low-order bytes of `value`, least significant byte first
///
/// # Panics
/// * Panics if `value` does not fit in 40 bits (i.e. `value >= 2^40`)
///
/// # Example
/// ```
/// use tact_parser::utils::write_uint40;
///
/// let bytes = write_uint40(0x9A78563412);
/// assert_eq!(bytes, [0x12, 0x34, 0x56, 0x78, 0x9A]);
/// ```
pub fn write_uint40(value: u64) -> [u8; 5] {
    // Any bit at position 40 or above means the value cannot be represented.
    assert!(value >> 40 == 0, "Value {value:#x} exceeds 40-bit range");

    // The little-endian form of the u64 already has the bytes in output
    // order; the upper three are zero thanks to the assertion above.
    let [b0, b1, b2, b3, b4, ..] = value.to_le_bytes();
    [b0, b1, b2, b3, b4]
}
92
93/// Read a 40-bit unsigned integer from a cursor (little-endian)
94///
95/// This is a convenience function for use with `std::io::Cursor` or `BufReader`.
96///
97/// # Arguments
98/// * `reader` - A reader implementing `std::io::Read`
99///
100/// # Returns
101/// * The 40-bit value as a u64
102///
103/// # Errors
104/// * Returns error if unable to read 5 bytes
105pub fn read_uint40_from<R: std::io::Read>(reader: &mut R) -> Result<u64> {
106    let mut buf = [0u8; 5];
107    reader.read_exact(&mut buf)?;
108    read_uint40(&buf)
109}
110
111/// Read a variable-length integer from a byte slice
112///
113/// Variable-length integers use 7 bits per byte with a continuation bit.
114/// This is compatible with protobuf/varint encoding.
115///
116/// # Arguments
117/// * `data` - Byte slice to read from
118///
119/// # Returns
120/// * Tuple of (value, bytes_consumed)
121///
122/// # Errors
123/// * Returns error if varint is malformed or exceeds 5 bytes
124///
125/// # Example
126/// ```
127/// use tact_parser::utils::read_varint;
128///
129/// let data = [0x08]; // Value 8
130/// let (value, consumed) = read_varint(&data).unwrap();
131/// assert_eq!(value, 8);
132/// assert_eq!(consumed, 1);
133///
134/// let data = [0x96, 0x01]; // Value 150
135/// let (value, consumed) = read_varint(&data).unwrap();
136/// assert_eq!(value, 150);
137/// assert_eq!(consumed, 2);
138/// ```
139pub fn read_varint(data: &[u8]) -> Result<(u32, usize)> {
140    let mut result = 0u32;
141    let mut shift = 0;
142    let mut consumed = 0;
143
144    for &byte in data {
145        consumed += 1;
146
147        // Extract 7-bit value
148        let value = (byte & 0x7F) as u32;
149
150        // Check for overflow
151        if shift >= 32 || (shift == 28 && value > 0x0F) {
152            return Err(Error::IOError(std::io::Error::new(
153                std::io::ErrorKind::InvalidData,
154                "Varint exceeds 32-bit range",
155            )));
156        }
157
158        result |= value << shift;
159
160        // Check continuation bit
161        if byte & 0x80 == 0 {
162            return Ok((result, consumed));
163        }
164
165        shift += 7;
166
167        // Varints shouldn't exceed 5 bytes for 32-bit values
168        if consumed >= 5 {
169            return Err(Error::IOError(std::io::Error::new(
170                std::io::ErrorKind::InvalidData,
171                "Varint exceeds maximum length",
172            )));
173        }
174    }
175
176    Err(Error::IOError(std::io::Error::new(
177        std::io::ErrorKind::UnexpectedEof,
178        "Incomplete varint",
179    )))
180}
181
/// Encode a 32-bit value as a variable-length integer
///
/// The value is emitted 7 bits at a time, least significant group first.
/// Every byte except the last has its high bit set to mark a continuation.
///
/// # Arguments
/// * `value` - The value to encode
///
/// # Returns
/// * A vector containing the encoded varint (always at least one byte)
///
/// # Example
/// ```
/// use tact_parser::utils::write_varint;
///
/// let encoded = write_varint(8);
/// assert_eq!(encoded, vec![0x08]);
///
/// let encoded = write_varint(150);
/// assert_eq!(encoded, vec![0x96, 0x01]);
/// ```
pub fn write_varint(mut value: u32) -> Vec<u8> {
    let mut encoded = Vec::new();

    // While more than 7 bits remain, emit the low group flagged as "more follows".
    while value >= 0x80 {
        encoded.push((value & 0x7F) as u8 | 0x80);
        value >>= 7;
    }

    // What is left fits in 7 bits, so the cast is lossless and the
    // continuation bit stays clear, terminating the varint.
    encoded.push(value as u8);
    encoded
}
218
219/// Read a null-terminated C string from a byte slice
220///
221/// # Arguments
222/// * `data` - Byte slice to read from
223///
224/// # Returns
225/// * Tuple of (string, bytes_consumed)
226///
227/// # Errors
228/// * Returns error if no null terminator found or invalid UTF-8
229pub fn read_cstring(data: &[u8]) -> Result<(String, usize)> {
230    // Find null terminator
231    let null_pos = data.iter().position(|&b| b == 0).ok_or_else(|| {
232        Error::IOError(std::io::Error::new(
233            std::io::ErrorKind::InvalidData,
234            "No null terminator found in C string",
235        ))
236    })?;
237
238    // Convert to string
239    let string = std::str::from_utf8(&data[..null_pos])
240        .map_err(|e| {
241            Error::IOError(std::io::Error::new(
242                std::io::ErrorKind::InvalidData,
243                format!("Invalid UTF-8 in C string: {e}"),
244            ))
245        })?
246        .to_string();
247
248    Ok((string, null_pos + 1)) // +1 for null terminator
249}
250
251/// Read a C string from a reader
252///
253/// # Arguments
254/// * `reader` - A reader implementing `std::io::Read`
255///
256/// # Returns
257/// * The string without null terminator
258///
259/// # Errors
260/// * Returns error if unable to read or invalid UTF-8
261pub fn read_cstring_from<R: std::io::Read>(reader: &mut R) -> Result<String> {
262    let mut bytes = Vec::new();
263    let mut byte = [0u8; 1];
264
265    loop {
266        reader.read_exact(&mut byte)?;
267        if byte[0] == 0 {
268            break;
269        }
270        bytes.push(byte[0]);
271    }
272
273    String::from_utf8(bytes).map_err(|e| {
274        Error::IOError(std::io::Error::new(
275            std::io::ErrorKind::InvalidData,
276            format!("Invalid UTF-8 in C string: {e}"),
277        ))
278    })
279}
280
/// Unit tests for the binary helpers in this module.
///
/// Covers the slice-based and reader-based 40-bit integer helpers, varint
/// encoding/decoding (including the malformed-input error paths), and the
/// C string readers.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn test_uint40_roundtrip() {
        let test_values = [
            0u64,
            1,
            255,
            256,
            65535,
            65536,
            0xFFFFFFFF,
            0x123456789A,
            0xFFFFFFFFFF, // Max 40-bit value
        ];

        for value in test_values {
            let bytes = write_uint40(value);
            let decoded = read_uint40(&bytes).unwrap();
            assert_eq!(value, decoded, "Failed for value {value:#x}");
        }
    }

    #[test]
    fn test_uint40_little_endian() {
        let data = [0x12, 0x34, 0x56, 0x78, 0x9A];
        let value = read_uint40(&data).unwrap();
        assert_eq!(value, 0x9A78563412);

        let bytes = write_uint40(0x9A78563412);
        assert_eq!(bytes, [0x12, 0x34, 0x56, 0x78, 0x9A]);
    }

    #[test]
    #[should_panic(expected = "exceeds 40-bit range")]
    fn test_uint40_overflow() {
        write_uint40(0x10000000000); // 2^40
    }

    #[test]
    fn test_uint40_insufficient_data() {
        let data = [0x12, 0x34, 0x56, 0x78]; // Only 4 bytes
        assert!(read_uint40(&data).is_err());
    }

    #[test]
    fn test_uint40_ignores_trailing_bytes() {
        // Only the first five bytes take part in the value.
        let data = [0x01, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF];
        assert_eq!(read_uint40(&data).unwrap(), 1);
    }

    #[test]
    fn test_uint40_from_reader() {
        let mut cursor = Cursor::new(vec![0x12, 0x34, 0x56, 0x78, 0x9A, 0xFF]);
        assert_eq!(read_uint40_from(&mut cursor).unwrap(), 0x9A78563412);
        // Exactly five bytes are consumed; the trailing byte is left unread.
        assert_eq!(cursor.position(), 5);
    }

    #[test]
    fn test_uint40_from_reader_insufficient_data() {
        let mut cursor = Cursor::new(vec![0x12, 0x34, 0x56, 0x78]); // Only 4 bytes
        assert!(read_uint40_from(&mut cursor).is_err());
    }

    #[test]
    fn test_varint_single_byte() {
        let data = [0x08];
        let (value, consumed) = read_varint(&data).unwrap();
        assert_eq!(value, 8);
        assert_eq!(consumed, 1);

        let encoded = write_varint(8);
        assert_eq!(encoded, vec![0x08]);
    }

    #[test]
    fn test_varint_multi_byte() {
        let data = [0x96, 0x01]; // Encodes 150
        let (value, consumed) = read_varint(&data).unwrap();
        assert_eq!(value, 150);
        assert_eq!(consumed, 2);

        let encoded = write_varint(150);
        assert_eq!(encoded, vec![0x96, 0x01]);
    }

    #[test]
    fn test_varint_max_value() {
        let value = 0xFFFFFFFF;
        let encoded = write_varint(value);
        assert_eq!(encoded, vec![0xFF, 0xFF, 0xFF, 0xFF, 0x0F]);

        let (decoded, consumed) = read_varint(&encoded).unwrap();
        assert_eq!(decoded, value);
        assert_eq!(consumed, 5);
    }

    #[test]
    fn test_varint_known_values() {
        // Test cases from protobuf spec
        let test_cases = [
            (0, vec![0x00]),
            (1, vec![0x01]),
            (127, vec![0x7F]),
            (128, vec![0x80, 0x01]),
            (300, vec![0xAC, 0x02]),
            (16384, vec![0x80, 0x80, 0x01]),
        ];

        for (value, expected) in test_cases {
            let encoded = write_varint(value);
            assert_eq!(encoded, expected, "Encoding failed for {value}");

            let (decoded, consumed) = read_varint(&expected).unwrap();
            assert_eq!(decoded, value, "Decoding failed for {value}");
            assert_eq!(consumed, expected.len());
        }
    }

    #[test]
    fn test_varint_trailing_data() {
        // Decoding stops at the first byte without a continuation bit.
        let data = [0x96, 0x01, 0xFF, 0xFF];
        let (value, consumed) = read_varint(&data).unwrap();
        assert_eq!(value, 150);
        assert_eq!(consumed, 2);
    }

    #[test]
    fn test_varint_exceeds_32_bits() {
        // Fifth byte carries more than the four bits that still fit in a u32.
        let data = [0xFF, 0xFF, 0xFF, 0xFF, 0x10];
        assert!(read_varint(&data).is_err());
    }

    #[test]
    fn test_varint_too_long() {
        // Fifth byte still has its continuation bit set.
        let data = [0x80, 0x80, 0x80, 0x80, 0x80, 0x01];
        assert!(read_varint(&data).is_err());
    }

    #[test]
    fn test_varint_incomplete() {
        assert!(read_varint(&[]).is_err());
        assert!(read_varint(&[0x80]).is_err()); // Continuation bit but no next byte
    }

    #[test]
    fn test_cstring() {
        let data = b"Hello, World!\0extra data";
        let (string, consumed) = read_cstring(data).unwrap();
        assert_eq!(string, "Hello, World!");
        assert_eq!(consumed, 14); // Including null terminator
    }

    #[test]
    fn test_cstring_empty() {
        let data = b"\0";
        let (string, consumed) = read_cstring(data).unwrap();
        assert_eq!(string, "");
        assert_eq!(consumed, 1);
    }

    #[test]
    fn test_cstring_no_terminator() {
        let data = b"No null here";
        assert!(read_cstring(data).is_err());
    }

    #[test]
    fn test_cstring_invalid_utf8() {
        let data = [0xFF, 0xFE, 0x00];
        assert!(read_cstring(&data).is_err());
    }

    #[test]
    fn test_cstring_from_reader() {
        let mut cursor = Cursor::new(b"first\0second\0".to_vec());
        assert_eq!(read_cstring_from(&mut cursor).unwrap(), "first");
        // The terminator is consumed, so the next call starts at "second".
        assert_eq!(cursor.position(), 6);
        assert_eq!(read_cstring_from(&mut cursor).unwrap(), "second");
        // Input exhausted: no terminator can be found any more.
        assert!(read_cstring_from(&mut cursor).is_err());
    }

    #[test]
    fn test_cstring_from_reader_no_terminator() {
        let mut cursor = Cursor::new(b"No null here".to_vec());
        assert!(read_cstring_from(&mut cursor).is_err());
    }

    #[test]
    fn test_cstring_from_reader_invalid_utf8() {
        let mut cursor = Cursor::new(vec![0xFF, 0xFE, 0x00]);
        assert!(read_cstring_from(&mut cursor).is_err());
    }
}