tact_parser/utils.rs
1//! Utility functions for binary operations used in TACT file formats
2
3use crate::jenkins3::hashlittle2;
4use crate::{Error, Result};
5
6/// Perform a [`HashPath`][0] with [`hashlittle2`][] (aka: jenkins3).
7///
8/// This normalises `path` using the same rules as [`SStrHash`][1], and then
9/// merges the two `u32`s of [`hashlittle2`][] into a `u64`, with `pc` as the
10/// high bytes.
11///
12/// [0]: https://wowdev.wiki/TACT#hashpath
13/// [1]: https://wowdev.wiki/SStrHash
14pub fn jenkins3_hashpath(path: &str) -> u64 {
15 let normalised = path.to_ascii_uppercase().replace('/', "\\");
16 let mut pc = 0;
17 let mut pb = 0;
18 hashlittle2(normalised.as_bytes(), &mut pc, &mut pb);
19
20 (u64::from(pc) << 32) | u64::from(pb)
21}
22
23/// Read a 40-bit (5-byte) unsigned integer from a byte slice (little-endian)
24///
25/// 40-bit integers are used throughout TACT formats for file sizes and offsets.
26/// They allow representing values up to 1TB while saving space compared to 64-bit.
27///
28/// # Arguments
29/// * `data` - Byte slice containing at least 5 bytes
30///
31/// # Returns
32/// * The 40-bit value as a u64
33///
34/// # Errors
35/// * Returns error if data contains less than 5 bytes
36///
37/// # Example
38/// ```
39/// use tact_parser::utils::read_uint40;
40///
41/// let data = [0x12, 0x34, 0x56, 0x78, 0x9A];
42/// let value = read_uint40(&data).unwrap();
43/// assert_eq!(value, 0x9A78563412);
44/// ```
45pub fn read_uint40(data: &[u8]) -> Result<u64> {
46 if data.len() < 5 {
47 return Err(Error::IOError(std::io::Error::new(
48 std::io::ErrorKind::UnexpectedEof,
49 format!("Need 5 bytes for uint40, got {}", data.len()),
50 )));
51 }
52
53 Ok((data[0] as u64)
54 | ((data[1] as u64) << 8)
55 | ((data[2] as u64) << 16)
56 | ((data[3] as u64) << 24)
57 | ((data[4] as u64) << 32))
58}
59
/// Encode a 40-bit (5-byte) unsigned integer as little-endian bytes.
///
/// # Arguments
/// * `value` - The value to encode; it must fit in 40 bits
///
/// # Returns
/// * A 5-byte array holding `value`, least significant byte first
///
/// # Panics
/// * Panics if `value` does not fit in 40 bits (`value >= 2^40`)
///
/// # Example
/// ```
/// use tact_parser::utils::write_uint40;
///
/// let bytes = write_uint40(0x9A78563412);
/// assert_eq!(bytes, [0x12, 0x34, 0x56, 0x78, 0x9A]);
/// ```
pub fn write_uint40(value: u64) -> [u8; 5] {
    // Any bit at position 40 or above means the value cannot be represented.
    assert!(
        value >> 40 == 0,
        "Value {value:#x} exceeds 40-bit range"
    );

    // The five low-order bytes of the little-endian form are the encoding.
    let le = value.to_le_bytes();
    [le[0], le[1], le[2], le[3], le[4]]
}
92
93/// Read a 40-bit unsigned integer from a cursor (little-endian)
94///
95/// This is a convenience function for use with `std::io::Cursor` or `BufReader`.
96///
97/// # Arguments
98/// * `reader` - A reader implementing `std::io::Read`
99///
100/// # Returns
101/// * The 40-bit value as a u64
102///
103/// # Errors
104/// * Returns error if unable to read 5 bytes
105pub fn read_uint40_from<R: std::io::Read>(reader: &mut R) -> Result<u64> {
106 let mut buf = [0u8; 5];
107 reader.read_exact(&mut buf)?;
108 read_uint40(&buf)
109}
110
111/// Read a variable-length integer from a byte slice
112///
113/// Variable-length integers use 7 bits per byte with a continuation bit.
114/// This is compatible with protobuf/varint encoding.
115///
116/// # Arguments
117/// * `data` - Byte slice to read from
118///
119/// # Returns
120/// * Tuple of (value, bytes_consumed)
121///
122/// # Errors
123/// * Returns error if varint is malformed or exceeds 5 bytes
124///
125/// # Example
126/// ```
127/// use tact_parser::utils::read_varint;
128///
129/// let data = [0x08]; // Value 8
130/// let (value, consumed) = read_varint(&data).unwrap();
131/// assert_eq!(value, 8);
132/// assert_eq!(consumed, 1);
133///
134/// let data = [0x96, 0x01]; // Value 150
135/// let (value, consumed) = read_varint(&data).unwrap();
136/// assert_eq!(value, 150);
137/// assert_eq!(consumed, 2);
138/// ```
139pub fn read_varint(data: &[u8]) -> Result<(u32, usize)> {
140 let mut result = 0u32;
141 let mut shift = 0;
142 let mut consumed = 0;
143
144 for &byte in data {
145 consumed += 1;
146
147 // Extract 7-bit value
148 let value = (byte & 0x7F) as u32;
149
150 // Check for overflow
151 if shift >= 32 || (shift == 28 && value > 0x0F) {
152 return Err(Error::IOError(std::io::Error::new(
153 std::io::ErrorKind::InvalidData,
154 "Varint exceeds 32-bit range",
155 )));
156 }
157
158 result |= value << shift;
159
160 // Check continuation bit
161 if byte & 0x80 == 0 {
162 return Ok((result, consumed));
163 }
164
165 shift += 7;
166
167 // Varints shouldn't exceed 5 bytes for 32-bit values
168 if consumed >= 5 {
169 return Err(Error::IOError(std::io::Error::new(
170 std::io::ErrorKind::InvalidData,
171 "Varint exceeds maximum length",
172 )));
173 }
174 }
175
176 Err(Error::IOError(std::io::Error::new(
177 std::io::ErrorKind::UnexpectedEof,
178 "Incomplete varint",
179 )))
180}
181
/// Encode a 32-bit value as a variable-length integer.
///
/// The value is emitted 7 bits at a time, least significant group first.
/// Every byte except the last has its high (continuation) bit set.
///
/// # Arguments
/// * `value` - The value to encode
///
/// # Returns
/// * A vector containing the encoded varint (always at least one byte)
///
/// # Example
/// ```
/// use tact_parser::utils::write_varint;
///
/// let encoded = write_varint(8);
/// assert_eq!(encoded, vec![0x08]);
///
/// let encoded = write_varint(150);
/// assert_eq!(encoded, vec![0x96, 0x01]);
/// ```
pub fn write_varint(mut value: u32) -> Vec<u8> {
    let mut encoded = Vec::new();

    // While bits remain above the low 7, emit a group flagged "more follows".
    while value > 0x7F {
        encoded.push((value & 0x7F) as u8 | 0x80);
        value >>= 7;
    }

    // Final group with the continuation bit clear; this also covers zero.
    encoded.push(value as u8);
    encoded
}
218
219/// Read a null-terminated C string from a byte slice
220///
221/// # Arguments
222/// * `data` - Byte slice to read from
223///
224/// # Returns
225/// * Tuple of (string, bytes_consumed)
226///
227/// # Errors
228/// * Returns error if no null terminator found or invalid UTF-8
229pub fn read_cstring(data: &[u8]) -> Result<(String, usize)> {
230 // Find null terminator
231 let null_pos = data.iter().position(|&b| b == 0).ok_or_else(|| {
232 Error::IOError(std::io::Error::new(
233 std::io::ErrorKind::InvalidData,
234 "No null terminator found in C string",
235 ))
236 })?;
237
238 // Convert to string
239 let string = std::str::from_utf8(&data[..null_pos])
240 .map_err(|e| {
241 Error::IOError(std::io::Error::new(
242 std::io::ErrorKind::InvalidData,
243 format!("Invalid UTF-8 in C string: {e}"),
244 ))
245 })?
246 .to_string();
247
248 Ok((string, null_pos + 1)) // +1 for null terminator
249}
250
251/// Read a 40-bit (5-byte) unsigned integer from a byte slice (big-endian)
252///
253/// 40-bit integers are used throughout TACT formats for file sizes and offsets.
254/// They allow representing values up to 1TB while saving space compared to 64-bit.
255///
256/// # Arguments
257/// * `data` - Byte slice containing at least 5 bytes
258///
259/// # Returns
260/// * The 40-bit value as a u64
261///
262/// # Errors
263/// * Returns error if data contains less than 5 bytes
264///
265/// # Example
266/// ```
267/// use tact_parser::utils::read_uint40_be;
268///
269/// let data = [0x01, 0x00, 0x00, 0x00, 0x00]; // 4GB file
270/// let value = read_uint40_be(&data).unwrap();
271/// assert_eq!(value, 0x100000000);
272/// ```
273pub fn read_uint40_be(data: &[u8]) -> Result<u64> {
274 if data.len() < 5 {
275 return Err(Error::IOError(std::io::Error::new(
276 std::io::ErrorKind::UnexpectedEof,
277 format!("Need 5 bytes for uint40, got {}", data.len()),
278 )));
279 }
280
281 // TACT encoding format: 1 byte for high bits (32-39) + 4 bytes big-endian u32 (0-31)
282 let high_byte = data[0] as u64;
283 let low_u32 = u32::from_be_bytes([data[1], data[2], data[3], data[4]]) as u64;
284
285 Ok((high_byte << 32) | low_u32)
286}
287
/// Encode a 40-bit (5-byte) unsigned integer as big-endian bytes.
///
/// The first output byte holds bits 32-39 and the remaining four hold
/// bits 0-31 as a big-endian `u32`.
///
/// # Arguments
/// * `value` - The value to encode; it must fit in 40 bits
///
/// # Returns
/// * A 5-byte array holding `value`, most significant byte first
///
/// # Panics
/// * Panics if `value` does not fit in 40 bits (`value >= 2^40`)
///
/// # Example
/// ```
/// use tact_parser::utils::write_uint40_be;
///
/// let bytes = write_uint40_be(0x100000000); // 4GB
/// assert_eq!(bytes, [0x01, 0x00, 0x00, 0x00, 0x00]);
/// ```
pub fn write_uint40_be(value: u64) -> [u8; 5] {
    // Any bit at position 40 or above means the value cannot be represented.
    assert!(
        value >> 40 == 0,
        "Value {value:#x} exceeds 40-bit range"
    );

    // The last five bytes of the big-endian `u64` form are the encoding;
    // the first three are zero after the range check above.
    let be = value.to_be_bytes();
    [be[3], be[4], be[5], be[6], be[7]]
}
325
326/// Read a 40-bit unsigned integer from a cursor (big-endian)
327///
328/// This is a convenience function for use with `std::io::Cursor` or `BufReader`.
329///
330/// # Arguments
331/// * `reader` - A reader implementing `std::io::Read`
332///
333/// # Returns
334/// * The 40-bit value as a u64
335///
336/// # Errors
337/// * Returns error if unable to read 5 bytes
338pub fn read_uint40_be_from<R: std::io::Read>(reader: &mut R) -> Result<u64> {
339 let mut buf = [0u8; 5];
340 reader.read_exact(&mut buf)?;
341 read_uint40_be(&buf)
342}
343
344/// Read a C string from a reader
345///
346/// # Arguments
347/// * `reader` - A reader implementing `std::io::Read`
348///
349/// # Returns
350/// * The string without null terminator
351///
352/// # Errors
353/// * Returns error if unable to read or invalid UTF-8
354pub fn read_cstring_from<R: std::io::Read>(reader: &mut R) -> Result<String> {
355 let mut bytes = Vec::new();
356 let mut byte = [0u8; 1];
357
358 loop {
359 reader.read_exact(&mut byte)?;
360 if byte[0] == 0 {
361 break;
362 }
363 bytes.push(byte[0]);
364 }
365
366 String::from_utf8(bytes).map_err(|e| {
367 Error::IOError(std::io::Error::new(
368 std::io::ErrorKind::InvalidData,
369 format!("Invalid UTF-8 in C string: {e}"),
370 ))
371 })
372}
373
#[cfg(test)]
mod tests {
    use super::*;

    /// Representative values must survive a little-endian encode/decode cycle.
    #[test]
    fn test_uint40_roundtrip() {
        let samples: [u64; 9] = [
            0,
            1,
            255,
            256,
            65535,
            65536,
            0xFFFFFFFF,
            0x123456789A,
            0xFFFFFFFFFF, // Largest 40-bit value
        ];

        for &value in samples.iter() {
            let decoded = read_uint40(&write_uint40(value)).unwrap();
            assert_eq!(value, decoded, "Failed for value {value:#x}");
        }
    }

    /// The first byte on the wire is the least significant one.
    #[test]
    fn test_uint40_little_endian() {
        let wire = [0x12, 0x34, 0x56, 0x78, 0x9A];
        let number = 0x9A78563412;

        assert_eq!(read_uint40(&wire).unwrap(), number);
        assert_eq!(write_uint40(number), wire);
    }

    /// Encoding 2^40 must trip the range assertion.
    #[test]
    #[should_panic(expected = "exceeds 40-bit range")]
    fn test_uint40_overflow() {
        write_uint40(1u64 << 40);
    }

    /// Four bytes are not enough to decode a 40-bit value.
    #[test]
    fn test_uint40_insufficient_data() {
        let too_short = [0x12, 0x34, 0x56, 0x78];
        assert!(read_uint40(&too_short).is_err());
    }

    /// Values below 128 occupy a single byte.
    #[test]
    fn test_varint_single_byte() {
        let (value, consumed) = read_varint(&[0x08]).unwrap();
        assert_eq!(value, 8);
        assert_eq!(consumed, 1);

        assert_eq!(write_varint(8), vec![0x08]);
    }

    /// 150 needs two bytes: low seven bits with continuation, then the rest.
    #[test]
    fn test_varint_multi_byte() {
        let (value, consumed) = read_varint(&[0x96, 0x01]).unwrap();
        assert_eq!(value, 150);
        assert_eq!(consumed, 2);

        assert_eq!(write_varint(150), vec![0x96, 0x01]);
    }

    /// The largest `u32` must round-trip through the varint codec.
    #[test]
    fn test_varint_max_value() {
        let encoded = write_varint(u32::MAX);
        let (decoded, _) = read_varint(&encoded).unwrap();
        assert_eq!(decoded, u32::MAX);
    }

    /// Encoding and decoding agree on a table of known value/byte pairs.
    #[test]
    fn test_varint_known_values() {
        // Test cases from protobuf spec
        let cases: [(u32, &[u8]); 6] = [
            (0, &[0x00]),
            (1, &[0x01]),
            (127, &[0x7F]),
            (128, &[0x80, 0x01]),
            (300, &[0xAC, 0x02]),
            (16384, &[0x80, 0x80, 0x01]),
        ];

        for &(value, expected) in cases.iter() {
            let encoded = write_varint(value);
            assert_eq!(encoded, expected, "Encoding failed for {value}");

            let (decoded, consumed) = read_varint(expected).unwrap();
            assert_eq!(decoded, value, "Decoding failed for {value}");
            assert_eq!(consumed, expected.len());
        }
    }

    /// Data after the terminator is ignored; the terminator counts as consumed.
    #[test]
    fn test_cstring() {
        let (string, consumed) = read_cstring(b"Hello, World!\0extra data").unwrap();
        assert_eq!(string, "Hello, World!");
        assert_eq!(consumed, 14); // 13 characters plus the null terminator
    }

    /// A lone terminator decodes to the empty string.
    #[test]
    fn test_cstring_empty() {
        let (string, consumed) = read_cstring(b"\0").unwrap();
        assert_eq!(string, "");
        assert_eq!(consumed, 1);
    }

    /// Input without a terminator is rejected.
    #[test]
    fn test_cstring_no_terminator() {
        assert!(read_cstring(b"No null here").is_err());
    }

    /// Big-endian layout: one high byte followed by a big-endian `u32`.
    #[test]
    fn test_uint40_big_endian() {
        // 4GB file (0x100000000): only the high byte is set.
        let four_gb = [0x01, 0x00, 0x00, 0x00, 0x00];
        assert_eq!(read_uint40_be(&four_gb).unwrap(), 0x100000000);

        // A value using both parts: 0x0A << 32 | 0x12345678
        let mixed = [0x0A, 0x12, 0x34, 0x56, 0x78];
        assert_eq!(read_uint40_be(&mixed).unwrap(), 0x0A12345678);

        // Round-trip through the encoder.
        let original = 0x0A12345678u64;
        let restored = read_uint40_be(&write_uint40_be(original)).unwrap();
        assert_eq!(original, restored);
    }

    /// The reader-based variant decodes the same bytes as the slice variant.
    #[test]
    fn test_uint40_be_from_reader() {
        use std::io::Cursor;

        let four_gb = [0x01, 0x00, 0x00, 0x00, 0x00];
        let mut cursor = Cursor::new(&four_gb);
        assert_eq!(read_uint40_be_from(&mut cursor).unwrap(), 0x100000000);
    }
}