dar/lib.rs
1//! Pure-Rust reader for Denis Corbin DAR (Disk ARchiver) archives.
2//!
3//! Supports DAR formats 7–11 (produced by dar 2.3–2.8) and the legacy ≤7 grammar.
4//! Passware Kit Mobile produces format-9 archives; dar 2.8.5 produces 11.3.
5//! Entries and the catalogue compressed with gzip, bzip2, xz, zstd, lz4 or lzo
6//! are transparently decompressed (pure-Rust; each an optional feature, all on by
7//! default); encryption is not decoded.
8//!
9//! ## Format sketch
10//!
11//! ```text
12//! Slice header:
13//! [4] magic = 00 00 00 7b (SAUV_MAGIC_NUMBER = 123, big-endian u32)
14//! [10] internal_name label
15//! [1] flag [1] ext_char
16//! TLV list: infinint(count) + count × (u16 type + infinint len + data)
17//! ← archive_origin: all catalog archive_offset values are relative to here
18//!
19//! Archive body:
20//! escaped sequences (seqt_file, seqt_saved, …) + raw file bytes
21//!
22//! Catalog (located by seqt_catalogue escape: AD FD EA 77 21 43):
23//! [10] label + (NUL working-dir path, format 11.1+ only) + entries
24//!
25//! Each entry: cat_sig byte where (cat_sig & 0x1f | 0x60) gives type
26//! 'd' directory → NUL-name + inode [+ FSA] (push to dir stack)
27//! 'f' file → NUL-name + inode [+ FSA] + file-specific fields
28//! 'z' EOD → pop dir stack; depth=0 → done
29//! ```
30//!
31//! ## Key non-obvious invariants
32//!
33//! - **Infinint**: variable-length. The common form is 5 bytes
34//! (`0x80 XX XX XX XX`, a big-endian u32); timestamps past 2^32 use the
35//! 9-byte `0x40` form (big-endian u64). Encodings wider than 64 bits are
36//! rejected as corrupt — this reader decodes to `u64` or errors, never
37//! truncates.
38//! - **Permissions**: 2-byte big-endian u16, *not* an infinint.
39//! - **Timestamps**: format 8 stores a bare seconds infinint; format 9+ prefix
40//! a unit byte (`'s'`/`'u'`/`'n'`) and add a sub-second infinint for `'u'`/`'n'`.
41//! - **FSA** (format 9+ only): inode flag bit `0x10` (FSA-full) adds inode
42//! infinints and an FSA block; format 8 has no FSA.
43//! - **archive_offset**: points *directly* to the raw file bytes, not to the
44//! data-section header that precedes them in the body stream.
45//! `seek(archive_origin + archive_offset)` then `read(stored_size)`.
46//!
47//! Full format notes: `docs/implementation-notes.md`.
48
49// Production code is panic-free (no unwrap/expect, enforced by the workspace
50// lints); tests legitimately use them.
51#![cfg_attr(test, allow(clippy::unwrap_used, clippy::expect_used))]
52
53use std::fs::File;
54use std::io::{Cursor, Read, Seek, SeekFrom, Write};
55use std::path::{Path, PathBuf};
56
57use thiserror::Error;
58
/// DAR slice magic: `SAUV_MAGIC_NUMBER` (123) encoded as a big-endian `u32`,
/// i.e. the bytes `00 00 00 7b`.
const DAR_MAGIC: [u8; 4] = 123u32.to_be_bytes();

/// Upper bound (512 MiB) on the compressed catalogue bytes read from the
/// archive tail, guarding against a decompression bomb (per-file streams need
/// no such constant — they are bounded by the entry's known size).
const MAX_CATALOGUE_COMPRESSED: u64 = 512 << 20;
/// Upper bound (1 GiB) on the inflated catalogue; companion to
/// [`MAX_CATALOGUE_COMPRESSED`] in the same decompression-bomb guard.
const MAX_CATALOGUE_INFLATED: u64 = 1 << 30;

/// Upper bound (64 KiB) on a per-file CRC width (libdar uses 4 bytes per
/// gigabyte, so 64 KiB covers a 16 TiB file); a larger declared width is
/// treated as corrupt.
const MAX_CRC_SIZE: u64 = 64 << 10;

/// Upper bound (256 MiB) on the per-block uncompressed block size
/// (`compr_bs`); a header declaring more is treated as not block-compressed
/// (allocation-bomb guard). dar's default is 240 KiB, so this is far beyond
/// any practical setting.
const MAX_BLOCK_SIZE: u64 = 256 << 20;

/// The `seqt_catalogue` escape sequence that marks the catalog:
/// `AD FD EA 77 21 43`.
const SEQT_CATALOGUE: [u8; 6] = *b"\xAD\xFD\xEA\x77\x21\x43";

/// First archive format with an in-place (working-directory) path in the
/// catalog header — `archive_version(11,1)`, whose `value()` packs the major
/// version in the high byte and the minor in the low byte.
/// Formats 8, 9, 10 and 11.0 have no such field.
const FORMAT_11_1: u32 = (11 << 8) | 1;
84
/// Errors returned by [`DarReader`].
#[derive(Debug, Error)]
pub enum DarError {
    /// An underlying I/O operation failed; converted automatically from
    /// [`std::io::Error`] by `?`.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),
    /// The input does not start with the DAR magic (or is too short to hold it).
    #[error("not a DAR archive")]
    NotADar,
    /// The archive is structurally invalid; the payload describes what was wrong.
    #[error("corrupt archive: {0}")]
    Corrupt(String),
    /// No catalogue entry has the requested path; the payload is the requested
    /// path rendered as lossy UTF-8.
    #[error("entry not found: '{0}'")]
    EntryNotFound(String),
}
97
/// Result of checking a file entry's stored CRC against its decompressed data
/// (see [`DarReader::verify`]). CRC values are rendered as lowercase hex.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub enum CrcStatus {
    /// The stored CRC agrees with the data.
    Match,
    /// The stored CRC disagrees with the data — consistent with corruption or
    /// tampering of the archived bytes.
    Mismatch {
        /// CRC recorded in the catalogue (lowercase hex).
        stored: String,
        /// CRC computed over the decompressed data (lowercase hex).
        computed: String,
    },
    /// No CRC is stored for this entry (edition-1 archives record none), so
    /// integrity cannot be checked.
    NotStored,
}

impl core::fmt::Display for CrcStatus {
    /// Writes a short human-readable summary; formatter width/padding flags
    /// are not applied.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The two payload-free variants map to fixed text; only a mismatch
        // needs formatting.
        let fixed = match self {
            Self::Mismatch { stored, computed } => {
                return write!(f, "CRC mismatch: stored {stored}, computed {computed}");
            }
            Self::Match => "CRC match",
            Self::NotStored => "no CRC stored",
        };
        f.write_str(fixed)
    }
}
129
/// The kind of filesystem object a catalog entry describes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub enum EntryKind {
    /// A regular file.
    File,
    /// A directory.
    Directory,
    /// A symbolic link.
    Symlink,
    /// A named pipe (FIFO).
    NamedPipe,
    /// A socket.
    Socket,
    /// A character device node.
    CharDevice,
    /// A block device node.
    BlockDevice,
    /// A hard link entry.
    Hardlink,
    /// A catalog entry type this reader does not model (the raw `cat_sig` letter).
    Unknown(char),
}
145
/// Metadata about one archived filesystem object.
///
/// Paths and symlink targets are exposed as raw bytes — DAR (like the
/// filesystems it archives) does not guarantee UTF-8, and a forensic reader
/// must never lose or reject a byte-exact name. Use [`DarEntry::path_lossy`] for
/// display.
///
/// With the `serde` feature enabled the struct serializes; the two byte
/// fields are emitted as lossy-UTF-8 strings rather than byte arrays.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct DarEntry {
    /// Path as stored, raw bytes — may not be valid UTF-8. In JSON this is the
    /// lossy-UTF-8 display string (use the field directly for byte-exact data).
    #[cfg_attr(feature = "serde", serde(serialize_with = "serialize_bytes_lossy"))]
    pub path: Vec<u8>,
    /// What kind of filesystem object this entry describes.
    pub kind: EntryKind,
    /// Uncompressed size in bytes (0 for entries with no data).
    pub size: u64,
    /// Owner user id.
    pub uid: u64,
    /// Owner group id.
    pub gid: u64,
    /// Permission bits (the low bits of the mode).
    pub mode: u16,
    /// Access time, seconds since the Unix epoch.
    pub atime: i64,
    /// Modification time, seconds since the Unix epoch.
    pub mtime: i64,
    /// Status-change time, seconds since the Unix epoch; `None` for formats
    /// before 8, which do not record it.
    pub ctime: Option<i64>,
    /// Target of a symbolic link, raw bytes; `None` for non-symlinks. In JSON
    /// this is the lossy-UTF-8 display string (or null).
    #[cfg_attr(feature = "serde", serde(serialize_with = "serialize_opt_bytes_lossy"))]
    pub symlink_target: Option<Vec<u8>>,
}
181
182impl DarEntry {
183 /// The path decoded as lossy UTF-8 (invalid byte sequences become U+FFFD).
184 #[must_use]
185 pub fn path_lossy(&self) -> std::borrow::Cow<'_, str> {
186 String::from_utf8_lossy(&self.path)
187 }
188}
189
/// `serialize_with` helper: emit raw path/target bytes as a lossy-UTF-8 string.
/// This is only a human-readable projection for export (serde_json escapes
/// control characters); the byte-exact value stays available on the typed field.
#[cfg(feature = "serde")]
fn serialize_bytes_lossy<S: serde::Serializer>(bytes: &[u8], s: S) -> Result<S::Ok, S::Error> {
    let text = String::from_utf8_lossy(bytes);
    s.serialize_str(text.as_ref())
}
197
/// `serialize_with` helper for optional raw bytes: `Some` becomes a lossy-UTF-8
/// string, `None` becomes the serializer's null/none value.
// serde's `serialize_with` calls this with `&self.field`, so the signature must
// take `&Option<_>` (not `Option<&_>`); the lint does not apply here.
#[cfg(feature = "serde")]
#[allow(clippy::ref_option)]
fn serialize_opt_bytes_lossy<S: serde::Serializer>(
    target: &Option<Vec<u8>>,
    s: S,
) -> Result<S::Ok, S::Error> {
    let Some(bytes) = target else {
        return s.serialize_none();
    };
    let text = String::from_utf8_lossy(bytes);
    s.serialize_some(&text)
}
211
/// Internal catalogue record: the public [`DarEntry`] metadata plus the
/// location/encoding details needed to read the entry's data back out.
#[derive(Debug, Clone)]
struct EntryRef {
    // The first ten fields are copied one-for-one into `DarEntry` by
    // `to_dar_entry`; see that struct for their meaning.
    path: Vec<u8>,
    kind: EntryKind,
    size: u64,
    uid: u64,
    gid: u64,
    mode: u16,
    atime: i64,
    mtime: i64,
    ctime: Option<i64>,
    symlink_target: Option<Vec<u8>>,
    /// Offset of the entry's data, relative to the reader's `archive_origin`.
    archive_offset: u64,
    /// Number of bytes the entry's data occupies in the archive (the
    /// compressed length when the entry is compressed).
    stored_size: u64,
    /// Compression algorithm byte for this entry's data, as passed to
    /// `is_compressed` and the decoders.
    compression: u8,
    /// Stored per-file data CRC (raw bytes); `None` when the format records none
    /// (edition 1) or the width is zero.
    crc: Option<Vec<u8>>,
}
231
232impl EntryRef {
233 /// Project the internal entry into the public [`DarEntry`] (one clone of the
234 /// owned path/target fields).
235 fn to_dar_entry(&self) -> DarEntry {
236 DarEntry {
237 path: self.path.clone(),
238 kind: self.kind,
239 size: self.size,
240 uid: self.uid,
241 gid: self.gid,
242 mode: self.mode,
243 atime: self.atime,
244 mtime: self.mtime,
245 ctime: self.ctime,
246 symlink_target: self.symlink_target.clone(),
247 }
248 }
249}
250
/// Read-only DAR archive reader.
///
/// Built by [`DarReader::open`], which parses the slice header and the whole
/// catalogue up front; entry data is read from `inner` only on extraction.
pub struct DarReader<R: Read + Seek> {
    /// The underlying archive stream; sought and read on every extraction.
    inner: R,
    /// Byte position immediately after the slice header TLV block.
    /// `archive_origin + archive_offset` = absolute position of raw file bytes.
    archive_origin: u64,
    /// Archive format major version (`value() >> 8`). Format 1 stores no
    /// per-entry `storage_size`, so a compressed format-1 entry is decoded by
    /// streaming the codec to its natural end rather than reading a fixed length.
    format_major: u32,
    /// Whether the catalog parsed to a clean root EOD (see [`DarReader::is_complete`]).
    complete: bool,
    /// Uncompressed block size from the header (`FLAG_HAS_COMPRESS_BS`); non-zero
    /// means the archive uses dar's per-block compression framing, zero means a
    /// single codec stream. Governs both the catalogue and every entry.
    compr_bs: u64,
    /// Catalogue entries as returned by the catalogue parser.
    entries: Vec<EntryRef>,
}
269
270impl<R: Read + Seek> DarReader<R> {
    /// Open a DAR archive: parse the slice header at the reader's current
    /// position, locate the catalogue, and parse every catalogue entry.
    ///
    /// Two header grammars are handled, selected by the header's extension byte:
    ///
    /// - `'T'` (format 8+): a TLV list follows the header. The catalogue is
    ///   found by scanning for the `seqt_catalogue` escape or, failing that,
    ///   for the archive's data-name label; a compressed catalogue found via
    ///   the escape is inflated into memory before parsing.
    /// - `'N'` / `'S'` (legacy, format <= 7): the catalogue start is taken from
    ///   the end-of-archive `terminateur` trailer.
    ///
    /// # Errors
    ///
    /// - [`DarError::NotADar`] if the 4-byte magic cannot be read or does not match.
    /// - [`DarError::Corrupt`] for an unknown extension byte, a truncated TLV
    ///   count, a catalogue that cannot be located, or a legacy catalogue offset
    ///   that overflows or lies at/after the end of the archive.
    /// - [`DarError::Io`] for other read/seek failures, plus whatever the
    ///   catalogue inflater and parser report.
    // The archive/slice-header parser is one cohesive state machine; splitting
    // it would scatter the format logic across helpers and hurt readability.
    #[allow(clippy::too_many_lines)]
    pub fn open(mut reader: R) -> Result<Self, DarError> {
        let mut magic = [0u8; 4];
        // A short read here means "not a DAR", not an I/O failure worth surfacing.
        reader
            .read_exact(&mut magic)
            .map_err(|_| DarError::NotADar)?;
        if magic != DAR_MAGIC {
            return Err(DarError::NotADar);
        }

        let mut label = [0u8; 10];
        reader.read_exact(&mut label)?; // internal_name label
        let _flag = read_u8(&mut reader)?; // slice flag ('T' terminal / 'N' / 'E')
        let extension = read_u8(&mut reader)?; // 'T' = TLV (format 8+); 'N'/'S' = legacy (<= 7)

        // Format 8+ carries a TLV list and a `seqt_catalogue` escape; format <= 7
        // has neither — its catalogue is located via the end `terminateur` trailer
        // (libdar header.cpp extension handling; terminateur.cpp).
        // Each of these is assigned exactly once on every non-erroring branch below.
        let entries;
        let archive_origin;
        let format_major;
        let complete;
        let compr_bs;
        if extension == b'T' {
            // TLV list: infinint(count) then count × (u16 type + infinint len + data)
            let tlv_count = read_infinint(&mut reader).map_err(|e| match e {
                DarError::Io(_) => DarError::Corrupt("truncated TLV block".into()),
                other => other,
            })?;
            // The archive's data_name (TLV type 0x0003, a 10-byte label) is the
            // identity the catalogue's ref_data_name points at. It is preserved
            // when an archive is re-sliced (dar_xform) even though the slice's own
            // internal_name changes, so it — not the slice label — locates a
            // tape-marks-off catalogue. For a normally-created archive the two are
            // identical, so this is a no-op there.
            let mut data_name: Option<[u8; 10]> = None;
            for _ in 0..tlv_count {
                let mut typ = [0u8; 2];
                reader.read_exact(&mut typ)?;
                let len = read_infinint(&mut reader)?;
                if typ == [0x00, 0x03] && len == 10 {
                    let mut dn = [0u8; 10];
                    reader.read_exact(&mut dn)?;
                    data_name = Some(dn);
                } else {
                    skip(&mut reader, len)?;
                }
            }

            archive_origin = reader.stream_position()?;
            let format_value = read_format_value(&mut reader);
            // The archive's global compression algorithm is the byte immediately
            // after the version string; it tells us whether (and how) the
            // catalogue stream is compressed. Unreadable → treat as stored.
            let global_comp = read_u8(&mut reader).unwrap_or(b'n');
            // The cursor now sits at the command-line string; read on to the
            // compression block size (zero = single-stream, non-zero = per-block).
            compr_bs = read_compr_bs(&mut reader, format_value >> 8);
            // Rewind: find_catalogue treats the current position as archive_origin.
            reader.seek(SeekFrom::Start(archive_origin))?;

            // true → seqt_catalogue tape mark found (catalog has label + maybe path);
            // false → located by its ref_data_name label (tape marks off, e.g. Passware).
            let via_escape = find_catalogue(&mut reader, data_name.as_ref().unwrap_or(&label))?;
            format_major = format_value >> 8;
            if via_escape && is_compressed(global_comp) {
                // The catalogue is a single stream compressed with the archive
                // codec, beginning right after the seqt_catalogue escape and
                // running to the trailer. Inflate it, then parse from the
                // plaintext buffer — which begins with the in-catalog label and
                // optional in-place path, exactly like the uncompressed case.
                let mut compressed = Vec::new();
                reader
                    .by_ref()
                    .take(MAX_CATALOGUE_COMPRESSED)
                    .read_to_end(&mut compressed)?;
                let inflated = inflate_catalogue(&compressed, global_comp, compr_bs)?;
                let mut cur = Cursor::new(inflated);
                skip(&mut cur, 10)?; // catalog label
                if format_value >= FORMAT_11_1 {
                    skip_nul_string(&mut cur)?;
                }
                (entries, complete) = parse_catalog(&mut cur, format_major, global_comp)?;
            } else {
                // The catalogue opens with a 10-byte label and, from format 11.1,
                // an in-place path NUL-string before the entries. When located by
                // the seqt_catalogue escape the reader sits before the label; when
                // located by ref_data_name match (tape marks off) scan_window has
                // already consumed the matched label, so only the path remains.
                if via_escape {
                    skip(&mut reader, 10)?; // catalog label
                }
                if format_value >= FORMAT_11_1 {
                    skip_nul_string(&mut reader)?;
                }
                (entries, complete) = parse_catalog(&mut reader, format_major, global_comp)?;
            }
        } else if extension == b'N' || extension == b'S' {
            // Legacy editions (<= 7) predate block compression — always a stream.
            compr_bs = 0;
            if extension == b'S' {
                read_infinint(&mut reader)?; // slice size (multi-slice header); unused
            }
            archive_origin = reader.stream_position()?;
            let format_value = read_format_value(&mut reader); // 3-byte edition: value = major*256
            format_major = format_value >> 8;
            // The global compression char follows the version string (same as
            // format 8+). Formats <= 7 carry no per-entry compression byte, so
            // this single char governs both the catalogue and every entry's data.
            let global_comp = read_u8(&mut reader).unwrap_or(b'n');
            let cat_offset = read_terminateur(&mut reader)?;
            // The offset comes from the archive, so the sum is overflow-checked
            // and validated against the real end of the stream before seeking.
            let cat_start = archive_origin
                .checked_add(cat_offset)
                .ok_or_else(|| DarError::Corrupt("catalogue offset overflows".into()))?;
            let end = reader.seek(SeekFrom::End(0))?;
            if cat_start >= end {
                return Err(DarError::Corrupt(format!(
                    "catalogue start {cat_start} past archive end {end}"
                )));
            }
            reader.seek(SeekFrom::Start(cat_start))?;
            // Legacy catalogue: no 10-byte label, no path — entries begin here.
            // When the archive is compressed, the catalogue is a single codec
            // stream (the terminateur addresses its start); inflate it first.
            if is_compressed(global_comp) {
                let mut compressed = Vec::new();
                reader
                    .by_ref()
                    .take(MAX_CATALOGUE_COMPRESSED)
                    .read_to_end(&mut compressed)?;
                let inflated = inflate_catalogue(&compressed, global_comp, compr_bs)?;
                (entries, complete) =
                    parse_catalog(&mut Cursor::new(inflated), format_major, global_comp)?;
            } else {
                (entries, complete) = parse_catalog(&mut reader, format_major, global_comp)?;
            }
        } else {
            return Err(DarError::Corrupt(format!(
                "unknown slice-header extension {extension:#04x}"
            )));
        }

        Ok(Self {
            inner: reader,
            archive_origin,
            format_major,
            complete,
            compr_bs,
            entries,
        })
    }
423
    /// Number of catalogue entries, in O(1) — without materialising or cloning
    /// the entry list (cheap even for a multi-hundred-thousand-entry archive).
    /// Equals the length of the `Vec` returned by [`entries`](Self::entries).
    #[must_use]
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
430
    /// Iterate the catalogue entries lazily, cloning one [`DarEntry`] at a time
    /// rather than allocating the whole `Vec` up front — for streaming over a
    /// large archive (hashing, timelining, filtering) without holding every
    /// entry in memory at once. Use [`entries`](Self::entries) when you want them
    /// all collected.
    ///
    /// The iterator borrows the reader, so extraction (which needs `&mut self`)
    /// cannot run while it is alive.
    pub fn iter_entries(&self) -> impl Iterator<Item = DarEntry> + '_ {
        self.entries.iter().map(EntryRef::to_dar_entry)
    }
439
440 /// List all archived file entries (path and uncompressed size).
441 pub fn entries(&self) -> Vec<DarEntry> {
442 self.iter_entries().collect()
443 }
444
    /// Whether the catalog was parsed to a clean end.
    ///
    /// `false` means parsing stopped early — typically at a catalog entry type
    /// this reader does not model (e.g. a hardlink or device node) or at
    /// corruption — so [`entries`](Self::entries) may be an *incomplete* listing.
    /// A forensic caller should treat an incomplete listing as "more may exist".
    ///
    /// The value is fixed when the archive is opened; it is the completeness
    /// flag reported by the catalogue parser.
    #[must_use]
    pub fn is_complete(&self) -> bool {
        self.complete
    }
455
456 /// Verify a file entry's data against the CRC stored in the catalogue,
457 /// decompressing the entry as needed. Returns [`CrcStatus::Match`],
458 /// [`CrcStatus::Mismatch`], or [`CrcStatus::NotStored`]. Unlike a
459 /// verify-on-extract design, this never refuses to hand over the bytes —
460 /// a forensic caller can still [`extract`](Self::extract) data that fails
461 /// its CRC in order to examine the corruption.
462 pub fn verify<P: AsRef<[u8]>>(&mut self, path: P) -> Result<CrcStatus, DarError> {
463 let path = path.as_ref();
464 let stored = self
465 .entries
466 .iter()
467 .find(|e| e.path.as_slice() == path)
468 .ok_or_else(|| DarError::EntryNotFound(String::from_utf8_lossy(path).into_owned()))?
469 .crc
470 .clone();
471 let Some(stored) = stored else {
472 return Ok(CrcStatus::NotStored);
473 };
474 // The CRC covers the plaintext, so verify against the decompressed data.
475 let data = self.extract(path)?;
476 let computed = dar_crc(&data, stored.len());
477 if computed == stored {
478 Ok(CrcStatus::Match)
479 } else {
480 Ok(CrcStatus::Mismatch {
481 stored: to_hex(&stored),
482 computed: to_hex(&computed),
483 })
484 }
485 }
486
487 /// Extract a file by path, streaming its (decompressed) bytes to `out` and
488 /// returning the number of bytes written. Unlike [`extract`](Self::extract),
489 /// this never holds the whole file in memory, so it is safe for multi-GiB
490 /// entries (and composes with hashing, scanning, or writing to disk).
491 pub fn extract_to<P: AsRef<[u8]>, W: Write>(
492 &mut self,
493 path: P,
494 out: &mut W,
495 ) -> Result<u64, DarError> {
496 let path = path.as_ref();
497 let name = String::from_utf8_lossy(path);
498 let entry = self
499 .entries
500 .iter()
501 .find(|e| e.path.as_slice() == path)
502 .ok_or_else(|| DarError::EntryNotFound(name.clone().into_owned()))?
503 .clone();
504
505 // The raw bytes live at archive_origin + archive_offset. Both fields are
506 // attacker-controlled, so the sum is checked and the claimed length
507 // validated against the bytes that actually exist before reading.
508 let start = self
509 .archive_origin
510 .checked_add(entry.archive_offset)
511 .ok_or_else(|| {
512 DarError::Corrupt(format!("'{name}' archive offset overflows file position"))
513 })?;
514 let end = self.inner.seek(SeekFrom::End(0))?;
515 if start > end {
516 return Err(DarError::Corrupt(format!(
517 "'{name}' starts at {start}, past archive end {end}"
518 )));
519 }
520 let available = end - start;
521 self.inner.seek(SeekFrom::Start(start))?;
522
523 // Stored: stream the raw bytes straight through, no buffering.
524 if !is_compressed(entry.compression) {
525 if entry.stored_size > available {
526 return Err(DarError::Corrupt(format!(
527 "'{name}' claims {} stored bytes but only {available} remain",
528 entry.stored_size
529 )));
530 }
531 return Ok(std::io::copy(
532 &mut self.inner.by_ref().take(entry.stored_size),
533 out,
534 )?);
535 }
536
537 // Compressed: decode straight to `out`, capped at the declared size so a
538 // forged stream cannot over-inflate (streaming decompression-bomb guard).
539 let mut cap = CapWriter {
540 inner: out,
541 written: 0,
542 max: entry.size,
543 };
544 if self.format_major == 1 {
545 // Format 1 stores no storage_size; the codec stream (dar 1.x is
546 // gzip/zlib-only) runs from the offset to its own natural end.
547 decode_stream(self.inner.by_ref(), entry.compression, &mut cap)?;
548 } else {
549 // 8+/2-7: exactly stored_size compressed bytes on disk.
550 if entry.stored_size > available {
551 return Err(DarError::Corrupt(format!(
552 "'{name}' claims {} stored bytes but only {available} remain",
553 entry.stored_size
554 )));
555 }
556 let mut data = vec![0u8; entry.stored_size as usize];
557 self.inner.read_exact(&mut data)?;
558 decode_data(&data[..], entry.compression, self.compr_bs, &mut cap)?;
559 }
560 if cap.written != entry.size {
561 return Err(DarError::Corrupt(format!(
562 "'{name}' decompressed to {} bytes but catalog declares {}",
563 cap.written, entry.size
564 )));
565 }
566 Ok(cap.written)
567 }
568
569 /// Extract a file by path, returning its raw bytes. Buffers the whole entry
570 /// in memory; prefer [`extract_to`](Self::extract_to) for large files.
571 pub fn extract<P: AsRef<[u8]>>(&mut self, path: P) -> Result<Vec<u8>, DarError> {
572 let mut buf = Vec::new();
573 self.extract_to(path, &mut buf)?;
574 Ok(buf)
575 }
576}
577
578// ── Catalog parser ────────────────────────────────────────────────────────────
579
/// Size of the tail window (256 MiB). On archives larger than this, the
/// catalog scan starts this many bytes before EOF (the catalog always lives at
/// the tail), avoiding a full read of a multi-gigabyte forensic archive before
/// falling back to a full scan.
const TAIL_SCAN: u64 = 256 << 20;

/// Number of bytes requested from the reader per scan iteration (4 MiB).
const CHUNK: usize = 4 << 20;
// OVERLAP = max(SEQT_CATALOGUE.len(), label.len()) - 1, i.e. the 10-byte label
// minus one; this many bytes are carried across chunk boundaries so a marker
// split between two chunks is still seen whole.
const OVERLAP: usize = 10 - 1;
588
589/// Scan forward from the current reader position searching for either the
590/// `seqt_catalogue` escape or the archive `label`.
591///
592/// Returns `Some(true)` if the escape was found (reader positioned just after it),
593/// `Some(false)` if the label was found (reader positioned just after it),
594/// `None` if EOF was reached without a match.
595fn scan_window<R: Read + Seek>(
596 r: &mut R,
597 label: &[u8; 10],
598 use_label: bool,
599) -> Result<Option<bool>, DarError> {
600 let mut buf = vec![0u8; CHUNK + OVERLAP];
601 let mut overlap_len: usize = 0;
602 loop {
603 let chunk_file_pos = r.stream_position()?;
604 let n = r.read(&mut buf[overlap_len..overlap_len + CHUNK])?;
605 if n == 0 {
606 break;
607 }
608 let total = overlap_len + n;
609 // buf[0..overlap_len] → tail of previous chunk (file pos: chunk_file_pos - overlap_len)
610 // buf[overlap_len..total] → newly read bytes
611 let buf_base = chunk_file_pos - overlap_len as u64;
612
613 if let Some(i) = buf[..total]
614 .windows(SEQT_CATALOGUE.len())
615 .position(|w| w == SEQT_CATALOGUE)
616 {
617 r.seek(SeekFrom::Start(
618 buf_base + i as u64 + SEQT_CATALOGUE.len() as u64,
619 ))?;
620 return Ok(Some(true));
621 }
622 if use_label {
623 if let Some(i) = buf[..total]
624 .windows(label.len())
625 .position(|w| w == label.as_ref())
626 {
627 r.seek(SeekFrom::Start(buf_base + i as u64 + label.len() as u64))?;
628 return Ok(Some(false));
629 }
630 }
631
632 let keep = OVERLAP.min(total);
633 buf.copy_within(total - keep..total, 0);
634 overlap_len = keep;
635 }
636 Ok(None)
637}
638
/// Locate the catalog section and position the reader just past the marker
/// that identified it. The reader's position on entry is taken as the archive
/// origin (the lower bound of the search).
///
/// Returns `true` when the `seqt_catalogue` escape is found — the caller then
/// skips the 10-byte in-catalog label and (format 11.1+) the path NUL string.
/// The escape is a *sequential-read tape mark*; it is present only when the
/// archive was written with tape marks (libdar's default).
///
/// Returns `false` when the catalog is located by its `ref_data_name` label
/// directly. Archives written with tape marks disabled (e.g. by Passware Kit
/// Mobile, equivalent to `dar -at`) omit the escape; their catalog still begins
/// with the 10-byte `ref_data_name`, so scanning for `label` in the tail finds
/// it — a structural marker, not a heuristic. `label` is whatever the caller
/// supplies: the header's data-name when present, otherwise the slice label
/// (the two are identical for a normally-created archive). In this case the
/// matched label has already been consumed, so only the optional path remains
/// before the entries.
///
/// Returns `Err(Corrupt)` when neither marker is found.
///
/// Strategy: DAR catalogs always live at the tail of the archive. On forensic
/// archives ≥ 256 MiB we jump straight to the last 256 MiB and scan forward
/// from there, then fall back to a full forward scan from `archive_origin` if
/// needed. This reduces the I/O for a 92 GiB archive from ~99 GiB to ~107 MiB.
fn find_catalogue<R: Read + Seek>(r: &mut R, label: &[u8; 10]) -> Result<bool, DarError> {
    find_catalogue_within(r, label, TAIL_SCAN)
}
661
662/// Implementation of [`find_catalogue`] with the tail-scan window size as a
663/// parameter so the full-scan fallback can be exercised without a 256 MiB
664/// fixture.
665fn find_catalogue_within<R: Read + Seek>(
666 r: &mut R,
667 label: &[u8; 10],
668 tail_scan: u64,
669) -> Result<bool, DarError> {
670 // All-zero labels cannot be used as a reliable catalog marker (too common
671 // in zero-padded archive bodies).
672 let use_label = !label.iter().all(|&b| b == 0);
673
674 let archive_origin = r.stream_position()?;
675 let file_end = r.seek(SeekFrom::End(0))?;
676
677 if file_end <= archive_origin {
678 return Err(DarError::Corrupt("archive body too short".into()));
679 }
680
681 // Jump to at most tail_scan bytes before end; for small files this equals archive_origin.
682 let tail_start = archive_origin.max(file_end.saturating_sub(tail_scan));
683 r.seek(SeekFrom::Start(tail_start))?;
684
685 if let Some(result) = scan_window(r, label, use_label)? {
686 return Ok(result);
687 }
688
689 // Tail scan missed. Fall back to a full scan from archive_origin.
690 if tail_start > archive_origin {
691 r.seek(SeekFrom::Start(archive_origin))?;
692 if let Some(result) = scan_window(r, label, use_label)? {
693 return Ok(result);
694 }
695 }
696
697 Err(DarError::Corrupt("seqt_catalogue not found".into()))
698}
699
700/// The byte length of one slice's header (`magic + label + flag + extension +
701/// optional TLV / slice-size`). Every slice of a multi-volume archive begins
702/// with this header; slice 1's header is the archive's own slice header, while
703/// later slices' headers are stripped so only their data regions join the
704/// logical stream. Mirrors the header prefix parsed by [`DarReader::open`].
705fn slice_header_len<R: Read + Seek>(r: &mut R) -> Result<u64, DarError> {
706 let mut magic = [0u8; 4];
707 r.read_exact(&mut magic).map_err(|_| DarError::NotADar)?;
708 if magic != DAR_MAGIC {
709 return Err(DarError::NotADar);
710 }
711 skip(r, 10)?; // internal_name label
712 let _flag = read_u8(r)?;
713 match read_u8(r)? {
714 b'T' => {
715 // TLV list: infinint(count) then count × (u16 type + infinint len + data).
716 let tlv_count = read_infinint(r)?;
717 for _ in 0..tlv_count {
718 skip(r, 2)?;
719 let len = read_infinint(r)?;
720 skip(r, len)?;
721 }
722 }
723 b'N' => {}
724 b'S' => {
725 read_infinint(r)?; // legacy slice-size field
726 }
727 other => {
728 return Err(DarError::Corrupt(format!(
729 "unknown slice-header extension {other:#04x}"
730 )));
731 }
732 }
733 Ok(r.stream_position()?)
734}
735
/// One slice's contribution to the logical archive stream.
struct SliceSpan {
    /// The open slice file; re-sought on every read.
    file: File,
    /// Byte offset within the slice file where this slice's contributed data
    /// begins — 0 for slice 1 (its header is kept), the header length otherwise.
    file_data_start: u64,
    /// Where this slice begins in the logical (de-sliced) stream.
    logical_start: u64,
    /// Number of logical bytes this slice contributes (file length minus the
    /// stripped header and, on non-terminal slices, the 1-byte trailing flag).
    logical_len: u64,
}
747
/// A `Read + Seek` view over a multi-volume DAR archive (`base.1.dar`,
/// `base.2.dar`, …) presenting the slices as one contiguous logical stream:
/// slice 1 in full (its header is the archive's slice header) followed by every
/// later slice with its own slice header stripped. This is byte-identical to the
/// equivalent unsliced archive, so the catalogue and per-entry offsets resolve
/// across slice boundaries with no other change to the reader.
pub struct SliceReader {
    /// Slices in logical order; their spans are contiguous starting at 0.
    slices: Vec<SliceSpan>,
    /// Current position in the logical stream (may lie past `total` after a seek).
    pos: u64,
    /// Total length of the logical stream (sum of every slice's `logical_len`).
    total: u64,
}
759
760impl SliceReader {
761 /// Build the logical stream from an explicit, ordered list of slice files
762 /// (`base.1.dar`, `base.2.dar`, …); the first path is slice 1.
763 pub fn open(paths: &[PathBuf]) -> Result<Self, DarError> {
764 if paths.is_empty() {
765 return Err(DarError::Corrupt("no slices provided".into()));
766 }
767 let mut slices = Vec::with_capacity(paths.len());
768 let mut logical_start = 0u64;
769 for (i, path) in paths.iter().enumerate() {
770 let mut file = File::open(path)?;
771 let len = file.seek(SeekFrom::End(0))?;
772 file.seek(SeekFrom::Start(0))?;
773 let file_data_start = if i == 0 {
774 0
775 } else {
776 slice_header_len(&mut file)?
777 };
778 // libdar's SAR layer ends every slice with a 1-byte flag ('N' = a slice
779 // follows, 'T' = terminal). On a non-terminal slice that flag sits in
780 // the middle of the file data and must be dropped; the terminal slice's
781 // flag is the archive's own final byte and is kept — so the logical
782 // stream ends byte-identically to an unsliced archive and the
783 // end-relative terminateur (tape-marks-off catalogues) still resolves.
784 let trailer = u64::from(i + 1 < paths.len());
785 if len < file_data_start + trailer {
786 return Err(DarError::Corrupt(
787 "slice smaller than its header + flag".into(),
788 ));
789 }
790 let logical_len = len - file_data_start - trailer;
791 slices.push(SliceSpan {
792 file,
793 file_data_start,
794 logical_start,
795 logical_len,
796 });
797 logical_start = logical_start
798 .checked_add(logical_len)
799 .ok_or_else(|| DarError::Corrupt("total slice length overflows".into()))?;
800 }
801 Ok(Self {
802 slices,
803 pos: 0,
804 total: logical_start,
805 })
806 }
807}
808
809impl Read for SliceReader {
810 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
811 // Fill `buf` across slice boundaries — only stopping short at end-of-archive
812 // or an underlying short read — so callers that issue a single `read()` and
813 // assume a full buffer (as they may for an in-memory `Cursor`) behave
814 // identically over a sliced archive.
815 let mut written = 0;
816 while written < buf.len() {
817 let pos = self.pos;
818 // The first slice whose data extends past `pos` contains it (slices are
819 // contiguous from 0); no such slice means end-of-archive.
820 let Some(idx) = self
821 .slices
822 .iter()
823 .position(|s| pos < s.logical_start + s.logical_len)
824 else {
825 break;
826 };
827 let n = {
828 let span = &mut self.slices[idx];
829 let within = pos - span.logical_start;
830 let want = (buf.len() - written).min((span.logical_len - within) as usize);
831 span.file
832 .seek(SeekFrom::Start(span.file_data_start + within))?;
833 span.file.read(&mut buf[written..written + want])?
834 };
835 if n == 0 {
836 break; // truncated slice: stop, do not spin
837 }
838 self.pos += n as u64;
839 written += n;
840 }
841 Ok(written)
842 }
843}
844
845impl Seek for SliceReader {
846 fn seek(&mut self, from: SeekFrom) -> std::io::Result<u64> {
847 let target: i128 = match from {
848 SeekFrom::Start(n) => i128::from(n),
849 SeekFrom::End(n) => i128::from(self.total) + i128::from(n),
850 SeekFrom::Current(n) => i128::from(self.pos) + i128::from(n),
851 };
852 if target < 0 {
853 return Err(std::io::Error::new(
854 std::io::ErrorKind::InvalidInput,
855 "seek before start of archive",
856 ));
857 }
858 self.pos = target as u64;
859 Ok(self.pos)
860 }
861}
862
863impl DarReader<SliceReader> {
864 /// Open a multi-volume (sliced) archive from its basename: `base` resolves
865 /// `base.1.dar`, `base.2.dar`, … until a slice is missing. The catalogue
866 /// lives in the last slice and entry data may span slices — both are handled
867 /// transparently. Errors if no `base.1.dar` exists.
868 pub fn open_slices(basename: &Path) -> Result<Self, DarError> {
869 let parent = basename
870 .parent()
871 .filter(|p| !p.as_os_str().is_empty())
872 .unwrap_or_else(|| Path::new("."));
873 let stem = basename
874 .file_name()
875 .and_then(|s| s.to_str())
876 .ok_or_else(|| DarError::Corrupt("invalid slice basename".into()))?;
877 let mut paths = Vec::new();
878 let mut n = 1u64;
879 loop {
880 let p = parent.join(format!("{stem}.{n}.dar"));
881 if !p.exists() {
882 break;
883 }
884 paths.push(p);
885 n += 1;
886 }
887 if paths.is_empty() {
888 return Err(DarError::Corrupt(format!(
889 "no slices found for basename {}",
890 basename.display()
891 )));
892 }
893 DarReader::open(SliceReader::open(&paths)?)
894 }
895}
896
897/// Read the NUL-terminated `version_string` at the current position and return
898/// `archive_version::value()` = `major*256 + fix`, where `major = b0*256 + b1`
899/// and each byte is `value + 48`. Format <= 7 stores only `"NN"` (fix implicitly
900/// 0); format 8+ stores `"NNf"`. Returns `u32::MAX` for an unreadable string so
901/// an unknown future format is treated as newest.
902fn read_format_value<R: Read>(r: &mut R) -> u32 {
903 let b = read_nul_bytes(r).unwrap_or_default();
904 if b.len() >= 2 {
905 let major = u32::from(b[0].saturating_sub(48)) * 256 + u32::from(b[1].saturating_sub(48));
906 let fix = if b.len() >= 3 {
907 u32::from(b[2].saturating_sub(48))
908 } else {
909 0
910 };
911 major * 256 + fix
912 } else {
913 u32::MAX
914 }
915}
916
917/// Read the multi-byte header flag field (libdar header_flags.cpp): bytes are
918/// accumulated most-significant-first, the low bit (`0x01`) of each byte signals
919/// that another byte follows, and the value bits are `byte & 0xFE`.
920fn read_header_flags<R: Read>(r: &mut R) -> Result<u64, DarError> {
921 let mut bits: u64 = 0;
922 loop {
923 let a = read_u8(r)?;
924 if bits >> 56 != 0 {
925 return Err(DarError::Corrupt("header flag field too large".into()));
926 }
927 bits = (bits << 8) | u64::from(a & 0xFE);
928 if a & 0x01 == 0 {
929 return Ok(bits);
930 }
931 }
932}
933
934/// Read the compression block size from the archive header (cursor positioned
935/// just after the global compression byte). A non-zero result selects dar's
936/// per-block decompression; 0 means a single codec stream.
937///
938/// Returns 0 for edition 1 (no flags), when no block size is recorded, when the
939/// value is implausibly large ([`MAX_BLOCK_SIZE`]), or when the header carries
940/// fields this reader does not parse (encryption / KDF / isolated-catalogue
941/// slicing — none of which are decodable anyway). Best-effort: a read error also
942/// degrades to 0, so a genuinely block-framed stream then fails loudly at the
943/// decode step rather than being silently mis-read. Existing single-stream
944/// archives are unaffected — they have no block size and resolve to 0.
945fn read_compr_bs<R: Read>(r: &mut R, format_major: u32) -> u64 {
946 fn inner<R: Read>(r: &mut R, format_major: u32) -> Result<u64, DarError> {
947 const INITIAL_OFFSET: u64 = 0x08;
948 const HAS_COMPRESS_BS: u64 = 0x0800;
949 // Fields sitting between the flags and the block size that this reader
950 // does not parse; archives that set them (encrypted / KDF / isolated
951 // catalogue) are not decodable regardless.
952 const COMPLEX: u64 = 0x20 | 0x04 | 0x02 | 0x0400; // scrambled | crypted-key | ref-slicing | kdf
953
954 skip_nul_string(r)?; // command line
955 if format_major < 2 {
956 return Ok(0); // the flag field was introduced at edition 2
957 }
958 let flags = read_header_flags(r)?;
959 if flags & COMPLEX != 0 || flags & HAS_COMPRESS_BS == 0 {
960 return Ok(0);
961 }
962 if flags & INITIAL_OFFSET != 0 {
963 read_infinint(r)?; // skip the initial offset
964 }
965 let bs = read_infinint(r)?;
966 Ok(if bs > MAX_BLOCK_SIZE { 0 } else { bs })
967 }
968 inner(r, format_major).unwrap_or(0)
969}
970
/// Report whether a libdar compression char names a known compression
/// algorithm.
///
/// `compression2char` emits the algorithm letter in lowercase for streamed mode
/// and uppercase for per-block mode, so the check is case-insensitive:
/// `z`=gzip, `y`=bzip2, `x`=xz, `l`/`j`/`k`=lzo variants, `d`=zstd, `q`=lz4.
/// `n` means stored. Any other byte — e.g. a header placeholder in a
/// non-dar-produced archive — counts as not compressed, so the catalogue/entry
/// is read verbatim rather than mis-decoded.
fn is_compressed(algo: u8) -> bool {
    const CODEC_LETTERS: &[u8] = b"zyxljkdq";
    CODEC_LETTERS.contains(&algo.to_ascii_lowercase())
}
983
984/// Inflate a compressed catalogue into a single buffer, routing through the same
985/// [`decode_stream`]/[`CapWriter`] path the per-file extractor uses and capping
986/// output at `MAX_CATALOGUE_INFLATED` (decompression-bomb guard). Trailing bytes
987/// after the codec stream (the archive trailer) are ignored by the decoder.
988fn inflate_catalogue(compressed: &[u8], algo: u8, block_size: u64) -> Result<Vec<u8>, DarError> {
989 let mut out = Vec::new();
990 let mut cap = CapWriter {
991 inner: &mut out,
992 written: 0,
993 max: MAX_CATALOGUE_INFLATED,
994 };
995 decode_data(compressed, algo, block_size, &mut cap)?;
996 Ok(out)
997}
998
999/// Decode a compressed data span. The archive uses dar's per-block framing (see
1000/// [`decode_blocks`]) when a block size is recorded (`block_size > 0`) or the
1001/// codec is lz4/lzo — which have no streamed form and so are always block-framed
1002/// (dar applies a default block size that it does not store in the header).
1003/// Otherwise it is a single codec stream (see [`decode_stream`]).
1004fn decode_data<W: Write>(
1005 data: &[u8],
1006 algo: u8,
1007 block_size: u64,
1008 out: &mut W,
1009) -> Result<(), DarError> {
1010 let always_block = matches!(algo.to_ascii_lowercase(), b'q' | b'l' | b'j' | b'k');
1011 if block_size > 0 || always_block {
1012 decode_blocks(data, algo, block_size, out)
1013 } else {
1014 decode_stream(data, algo, out)
1015 }
1016}
1017
1018/// Decode a dar `block_compressor` stream: a sequence of blocks, each
1019/// `[type: 1 byte][infinint compressed_size][compressed_size bytes]`, terminated
1020/// by an `H_EOF` block (size 0). Each `H_DATA` block is decompressed
1021/// independently and appended to `out` (libdar block_compressor.cpp /
1022/// compress_block_header.cpp).
1023///
1024/// For lz4 each block is a raw LZ4 block decoded into a `block_size`-byte buffer;
1025/// for the other codecs each block is a complete, self-delimiting codec stream
1026/// decoded via [`decode_stream`]. `block_size` is the archive's uncompressed
1027/// block size (the lz4 destination capacity). Each block's compressed size is
1028/// bounded by the remaining input, which also bounds the loop to O(input)
1029/// iterations.
1030fn decode_blocks<W: Write>(
1031 data: &[u8],
1032 algo: u8,
1033 block_size: u64,
1034 out: &mut W,
1035) -> Result<(), DarError> {
1036 const H_DATA: u8 = 1;
1037 const H_EOF: u8 = 2;
1038
1039 let mut input = data;
1040 // Reusable destination buffer for the raw block codecs (lz4, lzo): their
1041 // blocks carry no uncompressed size, so each decodes into a buffer seeded to
1042 // the declared block size, or to cover dar's default (240 KiB) when the
1043 // archive records none — a block that overflows it is genuine corruption,
1044 // surfaced as a decode error rather than silently grown.
1045 let mut raw_block_buf: Vec<u8> =
1046 if matches!(algo.to_ascii_lowercase(), b'q' | b'l' | b'j' | b'k') {
1047 let seed = if block_size > 0 {
1048 block_size.min(MAX_BLOCK_SIZE) as usize
1049 } else {
1050 256 * 1024
1051 };
1052 vec![0u8; seed]
1053 } else {
1054 Vec::new()
1055 };
1056
1057 loop {
1058 let typ = read_u8(&mut input)
1059 .map_err(|_| DarError::Corrupt("truncated block stream: missing end marker".into()))?;
1060 let size = read_infinint(&mut input)?;
1061 match typ {
1062 H_EOF => {
1063 if size != 0 {
1064 return Err(DarError::Corrupt(
1065 "non-zero size on end-of-blocks marker".into(),
1066 ));
1067 }
1068 return Ok(());
1069 }
1070 H_DATA => {
1071 if size == 0 {
1072 return Err(DarError::Corrupt("zero-size compressed block".into()));
1073 }
1074 // A block cannot be larger than the bytes that remain in the
1075 // (already bounded) input. This both caps the allocation and,
1076 // since every block consumes at least its `size` bytes, bounds
1077 // the loop to O(input) iterations — no separate block-count cap.
1078 if size > input.len() as u64 {
1079 return Err(DarError::Corrupt(
1080 "compressed block size exceeds remaining input".into(),
1081 ));
1082 }
1083 let mut block = vec![0u8; size as usize];
1084 input
1085 .read_exact(&mut block)
1086 .map_err(|_| DarError::Corrupt("truncated compressed block".into()))?;
1087 match algo.to_ascii_lowercase() {
1088 b'q' => decode_lz4_block(&block, &mut raw_block_buf, out)?,
1089 b'l' | b'j' | b'k' => decode_lzo_block(&block, &mut raw_block_buf, out)?,
1090 // gzip/bzip2/xz/zstd block = a complete self-delimiting stream.
1091 _ => decode_stream(&block[..], algo, out)?,
1092 }
1093 }
1094 other => {
1095 return Err(DarError::Corrupt(format!(
1096 "unknown compressed block type {other}"
1097 )));
1098 }
1099 }
1100 }
1101}
1102
1103/// Decompress one raw lz4 block into `out` using `buf` (sized to the block size)
1104/// as the destination. A block that does not fit (or is malformed) is a decode
1105/// error — dar never writes a block larger than the archive's block size.
1106fn decode_lz4_block<W: Write>(block: &[u8], buf: &mut [u8], out: &mut W) -> Result<(), DarError> {
1107 let n = lz4_flex::block::decompress_into(block, buf)
1108 .map_err(|e| DarError::Corrupt(format!("lz4 block decode failed: {e}")))?;
1109 out.write_all(&buf[..n])?;
1110 Ok(())
1111}
1112
1113/// Decompress one raw lzo1x block into `out` using `buf` (sized to the block
1114/// size) as the destination. A block that does not fit, or is not a valid lzo1x
1115/// block, is a decode error — dar never writes a block larger than the archive's
1116/// block size, and the [`lzo`] decoder is bounds-checked, so malformed input
1117/// surfaces as a typed error rather than a panic.
1118fn decode_lzo_block<W: Write>(block: &[u8], buf: &mut [u8], out: &mut W) -> Result<(), DarError> {
1119 let n = lzo::decompress_into(block, buf)
1120 .map_err(|e| DarError::Corrupt(format!("lzo block decode failed: {e}")))?;
1121 out.write_all(&buf[..n])?;
1122 Ok(())
1123}
1124
/// A `Write` adapter that forwards to `inner`, counting bytes written and failing
/// once more than `max` would be written — the streaming decompression-bomb
/// guard used by [`DarReader::extract_to`] and by the catalogue inflater.
struct CapWriter<'a, W: Write> {
    /// Destination that receives every accepted write.
    inner: &'a mut W,
    /// Bytes forwarded to `inner` so far.
    written: u64,
    /// Upper bound on `written`; a write that would exceed it is refused.
    max: u64,
}
1133
1134impl<W: Write> Write for CapWriter<'_, W> {
1135 fn write(&mut self, data: &[u8]) -> std::io::Result<usize> {
1136 if self.written + data.len() as u64 > self.max {
1137 return Err(std::io::Error::other("decompressed data exceeds bound"));
1138 }
1139 self.inner.write_all(data)?;
1140 self.written += data.len() as u64;
1141 Ok(data.len())
1142 }
1143
1144 fn flush(&mut self) -> std::io::Result<()> {
1145 self.inner.flush()
1146 }
1147}
1148
/// Stream-decode a compressed input to `out`, dispatching on the libdar codec
/// char (case-insensitively): `z` → zlib, `y` → bzip2, `x` → xz, `d` → zstd.
/// The Read decoders stop at the codec stream's end (ignoring trailing
/// bytes); lzma-rs rejects trailing bytes only after fully validating the
/// stream, so that one error is treated as success.
///
/// # Errors
///
/// `Corrupt` when the selected decoder (or the copy into `out`) fails, and for
/// any codec char without a streamed decoder here.
fn decode_stream<R: Read, W: Write>(input: R, algo: u8, out: &mut W) -> Result<(), DarError> {
    match algo.to_ascii_lowercase() {
        b'z' => {
            std::io::copy(&mut flate2::read::ZlibDecoder::new(input), out)
                .map_err(|e| DarError::Corrupt(format!("zlib decode failed: {e}")))?;
            Ok(())
        }
        b'y' => {
            std::io::copy(&mut bzip2_rs::DecoderReader::new(input), out)
                .map_err(|e| DarError::Corrupt(format!("bzip2 decode failed: {e}")))?;
            Ok(())
        }
        b'x' => {
            // The xz decoder is handed a buffered reader over `input`.
            let mut br = std::io::BufReader::new(input);
            match lzma_rs::xz_decompress(&mut br, out) {
                Ok(()) => {}
                // Trailing bytes after a fully validated stream: not a failure.
                Err(lzma_rs::error::Error::XzError(ref m))
                    if m == "Unexpected data after last XZ block" => {}
                Err(e) => return Err(DarError::Corrupt(format!("xz decode failed: {e}"))),
            }
            Ok(())
        }
        b'd' => {
            // dar's streamed zstd is a standard zstd frame (ZSTD_compressStream).
            let mut dec = ruzstd::StreamingDecoder::new(input)
                .map_err(|e| DarError::Corrupt(format!("zstd decode failed: {e}")))?;
            std::io::copy(&mut dec, out)
                .map_err(|e| DarError::Corrupt(format!("zstd decode failed: {e}")))?;
            Ok(())
        }
        // An unrecognised codec char lands here — a clear error, never a silent
        // misread. (Single line so the e2e-coverage allowlist matches one specific line.)
        #[rustfmt::skip]
        other => Err(DarError::Corrupt(format!("unrecognised compression codec '{}'", other as char))),
    }
}
1189
1190/// Locate the catalogue in a pre-format-8 archive via the end `terminateur`
1191/// trailer (libdar terminateur.cpp:95-138), returning the catalogue start offset
1192/// relative to `archive_origin`.
1193///
1194/// From EOF, count trailing `0xFF` padding bytes (8 bits each); the first
1195/// non-`0xFF` byte encodes the remaining count in unary as its set high bits.
1196/// `byte_offset = total_bits * 4` is the distance back from that byte to the
1197/// catalogue-position infinint. The `0xFF` run is bounded so a hostile all-`0xFF`
1198/// tail cannot spin or overflow.
1199fn read_terminateur<R: Read + Seek>(r: &mut R) -> Result<u64, DarError> {
1200 const BLOCK_SIZE: u64 = 4;
1201 const MAX_BITS: u64 = 4096; // far beyond any real terminator
1202
1203 let mut pos = r.seek(SeekFrom::End(0))?;
1204 let mut bits: u64 = 0;
1205 let terminal = loop {
1206 if pos == 0 {
1207 return Err(DarError::Corrupt("terminator underflows archive".into()));
1208 }
1209 pos -= 1;
1210 r.seek(SeekFrom::Start(pos))?;
1211 let b = read_u8(r)?;
1212 if b == 0xFF {
1213 bits += 8;
1214 if bits > MAX_BITS {
1215 return Err(DarError::Corrupt("terminator padding too long".into()));
1216 }
1217 } else {
1218 break b;
1219 }
1220 };
1221 // The terminator byte must have its top bit set; count consecutive set MSBs.
1222 if terminal & 0x80 == 0 {
1223 return Err(DarError::Corrupt(format!(
1224 "invalid terminator byte {terminal:#04x}"
1225 )));
1226 }
1227 let mut x = terminal;
1228 while x != 0 {
1229 if x & 0x80 == 0 {
1230 return Err(DarError::Corrupt("malformed terminator bit run".into()));
1231 }
1232 bits += 1;
1233 x <<= 1;
1234 }
1235 let byte_offset = bits * BLOCK_SIZE;
1236 let infinint_start = pos
1237 .checked_sub(byte_offset)
1238 .ok_or_else(|| DarError::Corrupt("terminator offset underflows".into()))?;
1239 r.seek(SeekFrom::Start(infinint_start))?;
1240 read_infinint(r)
1241}
1242
1243/// Parse all catalog entries, returning file entries with their extraction info.
1244///
1245/// Stops when the root directory is closed (depth reaches zero) or an unknown
1246/// entry type is encountered (slice trailer).
1247fn parse_catalog<R: Read + Seek>(
1248 r: &mut R,
1249 format_major: u32,
1250 global_comp: u8,
1251) -> Result<(Vec<EntryRef>, bool), DarError> {
1252 let mut entries = Vec::new();
1253 let mut dir_stack: Vec<Vec<u8>> = Vec::new();
1254 let mut depth: u32 = 0;
1255 // True once the catalog is walked to its closing root EOD; left false if we
1256 // stop early (unknown entry type or a truncated stream).
1257 let mut complete = false;
1258
1259 loop {
1260 let mut buf = [0u8; 1];
1261 match r.read_exact(&mut buf) {
1262 Ok(()) => {}
1263 Err(_) => break,
1264 }
1265
1266 // Lower 5 bits of cat_sig + 0x60 gives the ASCII type letter.
1267 let entry_type = ((buf[0] & 0x1f) | 0x60) as char;
1268
1269 match entry_type {
1270 'z' => {
1271 // End of directory
1272 depth = depth.saturating_sub(1);
1273 dir_stack.pop();
1274 if depth == 0 {
1275 complete = true; // reached the closing root EOD — clean end
1276 break;
1277 }
1278 }
1279 'd' => {
1280 let name = read_nul_bytes(r)?;
1281 let inode = read_inode_base(r, format_major)?;
1282 if format_major >= 9 && (inode.flags >> 4) & 1 != 0 {
1283 skip_fsa(r)?;
1284 }
1285 let is_root = depth == 0;
1286 depth += 1;
1287 // The archive root (`<ROOT>`, or `"root"` in formats 1/9) is a
1288 // virtual node: `<ROOT>` is dropped entirely; a named root becomes
1289 // the path prefix. Neither is listed as an entry. Real
1290 // sub-directories are listed with their full path.
1291 if name != b"<ROOT>" {
1292 let path = join_path(&dir_stack, &name);
1293 if !is_root {
1294 entries.push(meta_entry(path, EntryKind::Directory, &inode, None));
1295 }
1296 dir_stack.push(name);
1297 }
1298 }
1299 'f' => {
1300 let name = read_nul_bytes(r)?;
1301 let inode = read_inode_base(r, format_major)?;
1302 if format_major >= 9 && (inode.flags >> 4) & 1 != 0 {
1303 skip_fsa(r)?;
1304 }
1305
1306 let FileFields {
1307 size,
1308 archive_offset,
1309 stored_size,
1310 compression,
1311 crc,
1312 } = read_file_fields(r, format_major, global_comp)?;
1313
1314 entries.push(EntryRef {
1315 path: join_path(&dir_stack, &name),
1316 kind: EntryKind::File,
1317 size,
1318 uid: inode.uid,
1319 gid: inode.gid,
1320 mode: inode.mode,
1321 atime: inode.atime,
1322 mtime: inode.mtime,
1323 ctime: inode.ctime,
1324 symlink_target: None,
1325 archive_offset,
1326 stored_size,
1327 compression,
1328 crc,
1329 });
1330 }
1331 'l' => {
1332 // Symbolic link: inode + NUL-terminated target path.
1333 let name = read_nul_bytes(r)?;
1334 let inode = read_inode_base(r, format_major)?;
1335 if format_major >= 9 && (inode.flags >> 4) & 1 != 0 {
1336 skip_fsa(r)?;
1337 }
1338 let target = read_nul_bytes(r)?;
1339 let path = join_path(&dir_stack, &name);
1340 entries.push(meta_entry(path, EntryKind::Symlink, &inode, Some(target)));
1341 }
1342 'p' | 's' => {
1343 // Named pipe (FIFO) / unix socket: a bare inode, no data and no
1344 // type-specific fields.
1345 let name = read_nul_bytes(r)?;
1346 let inode = read_inode_base(r, format_major)?;
1347 if format_major >= 9 && (inode.flags >> 4) & 1 != 0 {
1348 skip_fsa(r)?;
1349 }
1350 let kind = if entry_type == 'p' {
1351 EntryKind::NamedPipe
1352 } else {
1353 EntryKind::Socket
1354 };
1355 entries.push(meta_entry(join_path(&dir_stack, &name), kind, &inode, None));
1356 }
1357 _ => break, // unknown type = slice trailer or unhandled entry
1358 }
1359 }
1360
1361 Ok((entries, complete))
1362}
1363
/// The file-specific catalog fields that follow a file inode, as produced by
/// [`read_file_fields`].
struct FileFields {
    /// File size: the first infinint of the file-specific fields.
    size: u64,
    /// Data offset: the second infinint of the file-specific fields.
    archive_offset: u64,
    /// Recorded storage size. For formats <= 7 a recorded 0 is replaced by
    /// `size`; format 1 records none and uses `size`.
    stored_size: u64,
    /// Codec byte: the per-file byte for format 8+, otherwise the
    /// archive-global codec.
    compression: u8,
    /// Raw CRC bytes: length-prefixed for format 8+ (`None` for a zero width),
    /// two fixed bytes for formats 2-7, `None` for format 1.
    crc: Option<Vec<u8>>,
}
1372
1373/// Read the file-specific catalog fields after the inode. Layout differs by
1374/// format (libdar cat_file.cpp / crc.cpp):
1375/// - 8+: storage_size · file_data_status(1) · comp(1) · length-prefixed CRC.
1376/// - 2-7: storage_size · fixed 2-byte CRC; no status/comp byte — the
1377/// archive-global codec applies.
1378/// - 1: size · offset only; storage_size synthesised, global codec applies.
1379fn read_file_fields<R: Read + Seek>(
1380 r: &mut R,
1381 format_major: u32,
1382 global_comp: u8,
1383) -> Result<FileFields, DarError> {
1384 let size = read_infinint(r)?;
1385 let archive_offset = read_infinint(r)?;
1386 let (mut stored_size, compression, crc) = if format_major >= 8 {
1387 let ss = read_infinint(r)?;
1388 let _file_data_status = read_u8(r)?;
1389 let comp = read_u8(r)?;
1390 let crc = read_crc(r)?; // infinint width + that many raw bytes
1391 (ss, comp, crc)
1392 } else if format_major >= 2 {
1393 let ss = read_infinint(r)?;
1394 let mut crcbuf = [0u8; 2]; // legacy: fixed 2-byte CRC, no width prefix
1395 r.read_exact(&mut crcbuf)?;
1396 (ss, global_comp, Some(crcbuf.to_vec()))
1397 } else {
1398 (size, global_comp, None) // format 1: storage_size synthesised, no CRC
1399 };
1400 // Pre-8: storage_size 0 means the data is stored uncompressed.
1401 if format_major <= 7 && stored_size == 0 {
1402 stored_size = size;
1403 }
1404 Ok(FileFields {
1405 size,
1406 archive_offset,
1407 stored_size,
1408 compression,
1409 crc,
1410 })
1411}
1412
1413/// Read a format-8+ length-prefixed CRC: an infinint width then that many raw
1414/// bytes. A zero width (abnormal — libdar uses >= 1) yields `None`; a width past
1415/// [`MAX_CRC_SIZE`] is rejected as corrupt (allocation-bomb guard).
1416fn read_crc<R: Read>(r: &mut R) -> Result<Option<Vec<u8>>, DarError> {
1417 let crc_size = read_infinint(r)?;
1418 if crc_size == 0 {
1419 return Ok(None);
1420 }
1421 if crc_size > MAX_CRC_SIZE {
1422 return Err(DarError::Corrupt(format!(
1423 "CRC width {crc_size} exceeds {MAX_CRC_SIZE}-byte bound"
1424 )));
1425 }
1426 let mut buf = vec![0u8; crc_size as usize];
1427 r.read_exact(&mut buf)?;
1428 Ok(Some(buf))
1429}
1430
/// libdar's per-file CRC: an XOR-fold of `data` into a `width`-byte accumulator,
/// byte `i` into slot `i mod width` (zero-init, read out slot 0 first; no final
/// transform).
///
/// A zero `width` (a zero-width CRC is treated as absent) yields an empty
/// vector instead of panicking on the modulo.
fn dar_crc(data: &[u8], width: usize) -> Vec<u8> {
    let mut acc = vec![0u8; width];
    if width == 0 {
        return acc;
    }
    // Each `width`-sized chunk lines up with the accumulator slot for slot; a
    // shorter final chunk only touches the leading slots.
    for chunk in data.chunks(width) {
        for (slot, &byte) in acc.iter_mut().zip(chunk) {
            *slot ^= byte;
        }
    }
    acc
}
1441
/// Encode `bytes` as lowercase hexadecimal, two digits per byte, high nibble
/// first.
fn to_hex(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    bytes
        .iter()
        // Split each byte into its two nibbles; both are below 16, so the table
        // lookup can never be out of bounds.
        .flat_map(|&byte| [byte >> 4, byte & 0x0f])
        .map(|nibble| char::from(DIGITS[usize::from(nibble)]))
        .collect()
}
1454
/// Build a `/`-separated raw-byte path from a directory stack and a leaf name:
/// every stack component is followed by `/`, then the leaf is appended as-is.
fn join_path(stack: &[Vec<u8>], name: &[u8]) -> Vec<u8> {
    stack
        .iter()
        .flat_map(|dir| dir.iter().copied().chain(std::iter::once(b'/')))
        .chain(name.iter().copied())
        .collect()
}
1465
1466/// Build an `EntryRef` for a non-file inode (dir/symlink/pipe/socket): it carries
1467/// metadata but no archive data.
1468fn meta_entry(
1469 path: Vec<u8>,
1470 kind: EntryKind,
1471 inode: &Inode,
1472 symlink_target: Option<Vec<u8>>,
1473) -> EntryRef {
1474 EntryRef {
1475 path,
1476 kind,
1477 size: 0,
1478 uid: inode.uid,
1479 gid: inode.gid,
1480 mode: inode.mode,
1481 atime: inode.atime,
1482 mtime: inode.mtime,
1483 ctime: inode.ctime,
1484 symlink_target,
1485 archive_offset: 0,
1486 stored_size: 0,
1487 compression: b'n',
1488 crc: None,
1489 }
1490}
1491
1492// ── Low-level I/O helpers ─────────────────────────────────────────────────────
1493
1494/// Read a DAR variable-length infinint, decoded to `u64`.
1495///
1496/// Format (TG=4): optional leading `0x00` skip-bytes, then a terminal byte
1497/// with exactly one bit set; `pos = terminal.leading_zeros()` and the value
1498/// occupies `(skip_count * 8 + pos + 1) * 4` big-endian bytes.
1499///
1500/// A `u64` holds at most 8 data bytes. Any encoding wider than that — i.e.
1501/// *any* leading `0x00` (which alone implies ≥ 36 bytes) or a terminal below
1502/// `0x40` (`pos > 1`) — cannot be represented and is rejected as `Corrupt`
1503/// rather than silently truncated. This single bound also removes the
1504/// `(skip * 8 …)` arithmetic-overflow panic and caps the leading-zero scan, so
1505/// a malicious all-zero run can never spin or overflow the skip counter.
1506fn read_infinint<R: Read>(r: &mut R) -> Result<u64, DarError> {
1507 let terminal = read_u8(r)?;
1508 if terminal == 0x00 {
1509 // A skip-byte group is at least 36 data bytes — far beyond u64.
1510 return Err(DarError::Corrupt(
1511 "infinint exceeds 64-bit range (multi-group encoding)".into(),
1512 ));
1513 }
1514 if terminal.count_ones() != 1 {
1515 return Err(DarError::Corrupt(format!(
1516 "invalid infinint terminal: {terminal:#04x}"
1517 )));
1518 }
1519 let pos = terminal.leading_zeros(); // 0 ..= 7
1520 if pos > 1 {
1521 // data_bytes = (pos + 1) * 4 > 8 → does not fit in u64.
1522 return Err(DarError::Corrupt(format!(
1523 "infinint exceeds 64-bit range: terminal {terminal:#04x} implies {} bytes",
1524 (pos + 1) * 4
1525 )));
1526 }
1527 let data_bytes = (pos + 1) * 4; // 4 (terminal 0x80) or 8 (terminal 0x40)
1528 let mut val: u64 = 0;
1529 for _ in 0..data_bytes {
1530 val = (val << 8) | u64::from(read_u8(r)?);
1531 }
1532 Ok(val)
1533}
1534
1535fn read_u8<R: Read>(r: &mut R) -> Result<u8, DarError> {
1536 let mut b = [0u8; 1];
1537 r.read_exact(&mut b)?;
1538 Ok(b[0])
1539}
1540
/// Upper bound, in bytes and excluding the terminator, on a NUL-terminated
/// path/name field. Real DAR entries stay well under this; the cap stops a
/// NUL-free region of a hostile archive from growing the buffer until EOF (or
/// OOM on a multi-GiB stream).
const MAX_NUL_STRING: usize = 64 * 1024;
1545
1546/// Read a NUL-terminated byte string (raw, not UTF-8 validated), consuming the
1547/// NUL. Length-capped at `MAX_NUL_STRING` so a NUL-free hostile region can't grow
1548/// the buffer to EOF.
1549fn read_nul_bytes<R: Read>(r: &mut R) -> Result<Vec<u8>, DarError> {
1550 let mut bytes = Vec::new();
1551 loop {
1552 let b = read_u8(r)?;
1553 if b == 0 {
1554 break;
1555 }
1556 if bytes.len() >= MAX_NUL_STRING {
1557 return Err(DarError::Corrupt(format!(
1558 "NUL-terminated string exceeds {MAX_NUL_STRING} bytes"
1559 )));
1560 }
1561 bytes.push(b);
1562 }
1563 Ok(bytes)
1564}
1565
1566/// Skip a NUL-terminated string without collecting the bytes.
1567fn skip_nul_string<R: Read>(r: &mut R) -> Result<(), DarError> {
1568 let mut len: usize = 0;
1569 loop {
1570 if read_u8(r)? == 0 {
1571 return Ok(());
1572 }
1573 len += 1;
1574 if len > MAX_NUL_STRING {
1575 return Err(DarError::Corrupt(format!(
1576 "NUL-terminated string exceeds {MAX_NUL_STRING} bytes"
1577 )));
1578 }
1579 }
1580}
1581
1582/// Seek past `n` bytes.
1583fn skip<R: Seek>(r: &mut R, n: u64) -> Result<(), DarError> {
1584 if n > 0 {
1585 // `SeekFrom::Current` takes an i64; a value above i64::MAX would cast to
1586 // a negative offset and seek *backwards* (re-reading earlier bytes on a
1587 // File). No real DAR field is that large — reject it outright.
1588 let off = i64::try_from(n)
1589 .map_err(|_| DarError::Corrupt(format!("skip length {n} exceeds seekable range")))?;
1590 r.seek(SeekFrom::Current(off)).map_err(DarError::Io)?;
1591 }
1592 Ok(())
1593}
1594
1595/// Skip one DAR timestamp field.
1596///
1597/// Timestamps are prefixed with a type byte:
1598/// - `'s'` (0x73) and others: seconds only — one infinint follows
1599/// - `'n'` (0x6e): nanosecond precision — two infinints follow (seconds + nanoseconds)
1600fn read_timestamp<R: Read + Seek>(r: &mut R, format_major: u32) -> Result<i64, DarError> {
1601 // Format 8 and earlier store a bare seconds infinint with NO precision byte
1602 // (libdar datetime.cpp:372). Format 9+ prefix a unit byte ('s' seconds,
1603 // 'u' microsecond, 'n' nanosecond); sub-second units add a second infinint,
1604 // which we read and discard (seconds resolution is what we expose).
1605 if format_major < 9 {
1606 return Ok(read_infinint(r)? as i64);
1607 }
1608 let ts_type = read_u8(r)?;
1609 let secs = read_infinint(r)? as i64;
1610 if ts_type == b'n' || ts_type == b'u' {
1611 read_infinint(r)?;
1612 }
1613 Ok(secs)
1614}
1615
1616/// Read a 2-byte big-endian `u16` (uid/gid for format <= 7, and permission bits).
1617fn read_u16<R: Read>(r: &mut R) -> Result<u16, DarError> {
1618 let mut b = [0u8; 2];
1619 r.read_exact(&mut b)?;
1620 Ok(u16::from_be_bytes(b))
1621}
1622
/// Decoded inode metadata shared by every catalog entry type, as produced by
/// [`read_inode_base`].
struct Inode {
    /// Inode flag byte (formats 2+; synthesised as 0 for format 1). In format
    /// 9+ bit `0x10` is the FSA-full status.
    flags: u8,
    /// Owner id: a 2-byte `u16` on disk for format <= 7, an infinint for 8+.
    uid: u64,
    /// Group id, stored the same way as `uid`.
    gid: u64,
    /// Permission bits, a 2-byte big-endian `u16` on disk.
    mode: u16,
    /// First timestamp field, whole seconds.
    atime: i64,
    /// Second timestamp field, whole seconds.
    mtime: i64,
    /// Third timestamp field (last_cha), present only from format 8.
    ctime: Option<i64>,
}
1633
1634/// Read one inode's base fields and return them. Layout in order: an optional
1635/// flags byte (format 2+), uid, gid, a `u16` perms field, atime, mtime, and a
1636/// ctime for format 8+. uid/gid are a 2-byte `u16` for format `<= 7` and an
1637/// infinint for 8+; each timestamp is decoded by [`read_timestamp`]. FSA inode
1638/// fields (format 9+, when flag bit `0x10` is set) are consumed and discarded.
1639fn read_inode_base<R: Read + Seek>(r: &mut R, format_major: u32) -> Result<Inode, DarError> {
1640 // Format 1 predates extended attributes and has NO leading flag byte
1641 // (libdar cat_inode.cpp); formats 2+ store it. Synthesise 0 for format 1.
1642 let flags = if format_major >= 2 { read_u8(r)? } else { 0 };
1643 // uid/gid: 2-byte u16 for format <= 7 (libdar cat_inode.cpp:171), infinint for 8+.
1644 let (uid, gid) = if format_major <= 7 {
1645 (u64::from(read_u16(r)?), u64::from(read_u16(r)?))
1646 } else {
1647 (read_infinint(r)?, read_infinint(r)?)
1648 };
1649 let mode = read_u16(r)?; // perms: a 2-byte big-endian u16, never an infinint
1650 let atime = read_timestamp(r, format_major)?;
1651 let mtime = read_timestamp(r, format_major)?;
1652 // ctime (last_cha) exists only from format 8 (libdar cat_inode.cpp:197).
1653 let ctime = if format_major >= 8 {
1654 Some(read_timestamp(r, format_major)?)
1655 } else {
1656 None
1657 };
1658 // FSA inode fields exist only from format 9 (libdar cat_inode.cpp:264); bit
1659 // 0x10 is the FSA-full status. Formats <= 8 have no FSA.
1660 if format_major >= 9 && (flags >> 4) & 1 != 0 {
1661 read_infinint(r)?;
1662 read_infinint(r)?;
1663 }
1664 Ok(Inode {
1665 flags,
1666 uid,
1667 gid,
1668 mode,
1669 atime,
1670 mtime,
1671 ctime,
1672 })
1673}
1674
1675/// Skip one FSA (filesystem attributes) block.
1676///
1677/// Format: infinint(family_tag) + infinint(data_size) + data_size bytes.
1678fn skip_fsa<R: Read + Seek>(r: &mut R) -> Result<(), DarError> {
1679 let _tag = read_infinint(r)?;
1680 let size = read_infinint(r)?;
1681 skip(r, size)
1682}
1683
1684// ── Unit tests ────────────────────────────────────────────────────────────────
1685
1686#[cfg(test)]
1687mod tests {
1688 use super::*;
1689 use std::io::Cursor;
1690
1691 // ── SliceReader truncated-slice guard ─────────────────────────────────────
1692
#[test]
fn slicereader_stops_on_truncated_slice() {
    use std::io::Read;
    // Build, by hand, a span that claims more bytes than its backing file
    // holds — `open` can never produce one because it measures the real file.
    // The read must end at the real EOF instead of spinning on the missing
    // tail.
    let backing = std::env::temp_dir().join(format!("dar_ms_trunc_{}.bin", std::process::id()));
    std::fs::write(&backing, [1u8, 2, 3, 4]).unwrap();
    let lying_span = SliceSpan {
        file: File::open(&backing).unwrap(),
        file_data_start: 0,
        logical_start: 0,
        logical_len: 100, // lies: only 4 bytes exist
    };
    let mut reader = SliceReader {
        slices: vec![lying_span],
        pos: 0,
        total: 100,
    };
    let mut out = [0u8; 50];
    let got = reader.read(&mut out).unwrap();
    assert_eq!(got, 4);
    assert_eq!(&out[..4], &[1, 2, 3, 4]);
    let _ = std::fs::remove_file(&backing);
}
1716
1717 // ── read_infinint ─────────────────────────────────────────────────────────
1718
1719 #[test]
1720 fn infinint_decodes_value() {
1721 let data = [0x80u8, 0x00, 0x00, 0x00, 0x0d];
1722 assert_eq!(read_infinint(&mut Cursor::new(&data[..])).unwrap(), 13);
1723 }
1724
1725 #[test]
1726 fn infinint_bad_preamble_returns_corrupt() {
1727 // 0x03 = two bits set — not a valid infinint terminal.
1728 let data = [0x03u8, 0x00, 0x00, 0x00, 0x00];
1729 let err = read_infinint(&mut Cursor::new(&data[..])).unwrap_err();
1730 assert!(matches!(&err, DarError::Corrupt(_)));
1731 }
1732
1733 #[test]
1734 fn infinint_truncated_returns_io() {
1735 // Only 2 bytes — read_exact needs 5.
1736 let err = read_infinint(&mut Cursor::new(&[0x80u8, 0x00][..])).unwrap_err();
1737 assert!(matches!(err, DarError::Io(_)));
1738 }
1739
1740 #[test]
1741 fn infinint_0x40_preamble_reads_8_data_bytes() {
1742 // 0x40 terminal: leading_zeros=1, pos=1, data_bytes=(0*8+1+1)*4=8
1743 // Encodes the value 0x5d15_9331 in 8 big-endian bytes.
1744 let mut data = vec![0x40u8];
1745 data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00, 0x5d, 0x15, 0x93, 0x31]);
1746 assert_eq!(
1747 read_infinint(&mut Cursor::new(data)).unwrap(),
1748 0x5d15_9331u64
1749 );
1750 }
1751
1752 #[test]
1753 fn infinint_multi_bit_terminal_returns_corrupt() {
1754 // 0x60 = 0110_0000 — two bits set, not a valid terminal.
1755 let data = [0x60u8, 0x00, 0x00, 0x00, 0x00];
1756 let err = read_infinint(&mut Cursor::new(&data[..])).unwrap_err();
1757 assert!(matches!(&err, DarError::Corrupt(_)));
1758 }
1759
1760 // ── read_u8 ───────────────────────────────────────────────────────────────
1761
1762 #[test]
1763 fn read_u8_reads_single_byte() {
1764 assert_eq!(read_u8(&mut Cursor::new(&[0x42u8][..])).unwrap(), 0x42);
1765 }
1766
1767 #[test]
1768 fn read_u8_eof_returns_io() {
1769 let err = read_u8(&mut Cursor::new(&[][..])).unwrap_err();
1770 assert!(matches!(err, DarError::Io(_)));
1771 }
1772
1773 // ── read_nul_bytes ──────────────────────────────────────────────────────
1774
1775 #[test]
1776 fn nul_bytes_reads_until_nul() {
1777 let data = b"hello\x00world";
1778 assert_eq!(
1779 read_nul_bytes(&mut Cursor::new(&data[..])).unwrap(),
1780 b"hello"
1781 );
1782 }
1783
1784 #[test]
1785 fn nul_bytes_preserves_non_utf8() {
1786 // Raw bytes are kept verbatim — a non-UTF-8 name must NOT be rejected.
1787 let data = [0xFF, 0x80, 0x00];
1788 assert_eq!(
1789 read_nul_bytes(&mut Cursor::new(&data[..])).unwrap(),
1790 vec![0xFF, 0x80]
1791 );
1792 }
1793
1794 #[test]
1795 fn nul_bytes_eof_before_nul_returns_io() {
1796 let err = read_nul_bytes(&mut Cursor::new(b"no-nul".to_vec())).unwrap_err();
1797 assert!(matches!(err, DarError::Io(_)));
1798 }
1799
1800 // ── skip_nul_string ───────────────────────────────────────────────────────
1801
1802 #[test]
1803 fn skip_nul_string_advances_past_nul() {
1804 let data = b"skip\x00rest";
1805 let mut c = Cursor::new(data.to_vec());
1806 skip_nul_string(&mut c).unwrap();
1807 assert_eq!(c.position(), 5); // "skip\0" = 5 bytes consumed
1808 }
1809
1810 #[test]
1811 fn skip_nul_string_eof_returns_io() {
1812 let err = skip_nul_string(&mut Cursor::new(b"no-nul".to_vec())).unwrap_err();
1813 assert!(matches!(err, DarError::Io(_)));
1814 }
1815
1816 // ── find_catalogue ────────────────────────────────────────────────────────
1817
1818 #[test]
1819 fn find_catalogue_body_too_short() {
1820 // Fewer than 6 bytes — can't fill the initial window; label also too short.
1821 let label = [0u8; 10];
1822 let err = find_catalogue(&mut Cursor::new(&[0x01u8, 0x02, 0x03][..]), &label).unwrap_err();
1823 assert!(
1824 matches!(&err, DarError::Corrupt(s) if s == "archive body too short"
1825 || s == "seqt_catalogue not found")
1826 );
1827 }
1828
1829 #[test]
1830 fn find_catalogue_escape_at_start() {
1831 let mut data = [0xAD, 0xFD, 0xEA, 0x77, 0x21, 0x43, 0xFF];
1832 let mut c = Cursor::new(&mut data[..]);
1833 let via_escape = find_catalogue(&mut c, &[0u8; 10]).unwrap();
1834 assert!(via_escape);
1835 assert_eq!(c.position(), 6);
1836 }
1837
1838 #[test]
1839 fn find_catalogue_escape_not_found() {
1840 // 10 bytes of zeros, label is 0xFF×10 so label scan also fails.
1841 let label = [0xFFu8; 10];
1842 let err = find_catalogue(&mut Cursor::new(&[0u8; 10][..]), &label).unwrap_err();
1843 assert!(matches!(&err, DarError::Corrupt(s) if s == "seqt_catalogue not found"));
1844 }
1845
1846 #[test]
1847 fn find_catalogue_label_fallback() {
1848 let label: [u8; 10] = [0xA1, 0xB2, 0xC3, 0xD4, 0xE5, 0xF6, 0x07, 0x18, 0x29, 0x3A];
1849 // Prefix junk (no escape) followed by the label bytes.
1850 let mut data = vec![0x00u8; 5];
1851 data.extend_from_slice(&label);
1852 let mut c = Cursor::new(data);
1853 let via_escape = find_catalogue(&mut c, &label).unwrap();
1854 assert!(!via_escape);
1855 assert_eq!(c.position(), 15); // 5 junk + 10 label consumed
1856 }
1857
1858 // ── skip ──────────────────────────────────────────────────────────────────
1859
1860 #[test]
1861 fn skip_zero_does_not_move_cursor() {
1862 let mut c = Cursor::new(vec![0xFFu8; 10]);
1863 skip(&mut c, 0).unwrap();
1864 assert_eq!(c.position(), 0);
1865 }
1866
1867 #[test]
1868 fn skip_n_advances_cursor() {
1869 let mut c = Cursor::new(vec![0xFFu8; 10]);
1870 skip(&mut c, 7).unwrap();
1871 assert_eq!(c.position(), 7);
1872 }
1873
1874 // ── read_inode_base ───────────────────────────────────────────────────────
1875
1876 #[test]
1877 fn inode_base_bit4_clear_reads_31_bytes() {
1878 // flags(1) + uid(5) + gid(5) + perms(2) + 3×[type(1)+secs(5)] = 31 bytes
1879 let mut data = vec![0x00u8]; // flags (bit4=0)
1880 data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // uid
1881 data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // gid
1882 data.extend_from_slice(&[0x00, 0x00]); // perms
1883 for _ in 0..3 {
1884 data.push(b's'); // timestamp type
1885 data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // seconds
1886 }
1887 data.push(0xFF); // sentinel — must not be consumed
1888 let mut c = Cursor::new(data);
1889 assert_eq!(read_inode_base(&mut c, 11).unwrap().flags, 0x00);
1890 assert_eq!(c.position(), 31);
1891 }
1892
1893 #[test]
1894 fn inode_base_bit4_set_reads_41_bytes() {
1895 // flags(1) + uid(5) + gid(5) + perms(2) + 3×[type(1)+secs(5)] + nlink(5) + field9(5) = 41
1896 let mut data = vec![0x10u8]; // flags (bit4=1)
1897 data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // uid
1898 data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // gid
1899 data.extend_from_slice(&[0x00, 0x00]); // perms
1900 for _ in 0..3 {
1901 data.push(b's');
1902 data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]);
1903 }
1904 data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // nlink
1905 data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // field9
1906 data.push(0xFF); // sentinel
1907 let mut c = Cursor::new(data);
1908 assert_eq!(read_inode_base(&mut c, 11).unwrap().flags, 0x10);
1909 assert_eq!(c.position(), 41);
1910 }
1911
1912 // ── skip_fsa ─────────────────────────────────────────────────────────────
1913
1914 #[test]
1915 fn skip_fsa_consumes_tag_size_and_data() {
1916 // tag=infinint(5) + size=infinint(3) + 3 data bytes
1917 let mut data = Vec::new();
1918 data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x05]); // tag
1919 data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x03]); // size=3
1920 data.extend_from_slice(&[0xAA, 0xBB, 0xCC]); // data
1921 data.push(0xFF); // sentinel
1922 let mut c = Cursor::new(data);
1923 skip_fsa(&mut c).unwrap();
1924 assert_eq!(c.position(), 13); // 5 + 5 + 3 = 13
1925 }
1926
1927 // ── hardening: malicious / corrupted infinint encodings ───────────────────
1928 //
1929 // A `u64` holds at most 8 data bytes. The reader's contract is "decode to
1930 // u64 or return Corrupt" — it must never silently truncate an over-wide
1931 // value, overflow while computing the byte count, or loop on a zero run.
1932
1933 #[test]
1934 fn infinint_leading_zero_byte_returns_corrupt() {
1935 // A leading 0x00 skip-byte implies a ≥36-byte group — far beyond u64.
1936 // Must be rejected as Corrupt, not mislabelled as an I/O shortage.
1937 let data = [0x00u8, 0x80, 0x00, 0x00, 0x00, 0x00];
1938 let err = read_infinint(&mut Cursor::new(&data[..])).unwrap_err();
1939 assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
1940 }
1941
1942 #[test]
1943 fn infinint_12_byte_group_exceeds_u64_returns_corrupt() {
1944 // 0x20 terminal → pos=2 → 12 data bytes → cannot fit in u64.
1945 // Must error rather than silently truncate to a wrong value.
1946 let mut data = vec![0x20u8];
1947 data.extend_from_slice(&[0x11; 12]);
1948 let err = read_infinint(&mut Cursor::new(data)).unwrap_err();
1949 assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
1950 }
1951
1952 #[test]
1953 fn infinint_all_zero_run_returns_corrupt_without_hanging() {
1954 // A run of zero bytes must terminate promptly with Corrupt, never spin
1955 // consuming the whole stream (and never overflow-panic the skip count).
1956 let data = vec![0u8; 4096];
1957 let err = read_infinint(&mut Cursor::new(data)).unwrap_err();
1958 assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
1959 }
1960
1961 // ── hardening: unbounded NUL-terminated strings ───────────────────────────
1962
1963 #[test]
1964 fn nul_bytes_without_terminator_is_length_bounded() {
1965 // No NUL in 200 KiB of data: must be rejected once the path cap is hit,
1966 // not grow the buffer until EOF (or OOM on a multi-GiB stream).
1967 let data = vec![b'A'; 200_000];
1968 let err = read_nul_bytes(&mut Cursor::new(data)).unwrap_err();
1969 assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
1970 }
1971
1972 #[test]
1973 fn skip_nul_string_without_terminator_is_length_bounded() {
1974 let data = vec![b'A'; 200_000];
1975 let err = skip_nul_string(&mut Cursor::new(data)).unwrap_err();
1976 assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
1977 }
1978
1979 // ── hardening: skip must never seek backwards ─────────────────────────────
1980
1981 #[test]
1982 fn skip_value_above_i64_max_returns_corrupt() {
1983 // n > i64::MAX casts to a negative i64 → SeekFrom::Current would seek
1984 // *backwards* on a File (re-reading earlier bytes). Must be rejected,
1985 // and the stream position must not move.
1986 let mut c = Cursor::new(vec![0u8; 64]);
1987 c.set_position(32);
1988 let err = skip(&mut c, 0x8000_0000_0000_0000).unwrap_err();
1989 assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
1990 assert_eq!(c.position(), 32); // unchanged on a rejected skip
1991 }
1992
1993 // ── terminateur trailer (pre-8 catalog locator) ───────────────────────────
1994
1995 #[test]
1996 fn terminateur_reads_catalogue_offset() {
1997 // pos infinint 0x18 = 24; terminator 0xc0 → two leading ones → 2*4 = 8
1998 // bytes back to the infinint.
1999 let data = vec![0x80u8, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0xc0];
2000 assert_eq!(read_terminateur(&mut Cursor::new(data)).unwrap(), 24);
2001 }
2002
2003 #[test]
2004 fn terminateur_all_ff_underflows_returns_corrupt() {
2005 let err = read_terminateur(&mut Cursor::new(vec![0xFFu8; 4])).unwrap_err();
2006 assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
2007 }
2008
2009 #[test]
2010 fn terminateur_excessive_ff_padding_returns_corrupt() {
2011 let err = read_terminateur(&mut Cursor::new(vec![0xFFu8; 600])).unwrap_err();
2012 assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
2013 }
2014
2015 #[test]
2016 fn terminateur_low_terminator_byte_returns_corrupt() {
2017 // Terminator byte 0x01 has no top bit set.
2018 let data = vec![0x80u8, 0x00, 0x00, 0x00, 0x18, 0x01];
2019 let err = read_terminateur(&mut Cursor::new(data)).unwrap_err();
2020 assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
2021 }
2022
2023 #[test]
2024 fn terminateur_noncontiguous_high_bits_returns_corrupt() {
2025 // 0xA0 = 1010_0000: top bit set but the high-bit run is not contiguous.
2026 let data = vec![0x80u8, 0x00, 0x00, 0x00, 0x18, 0xA0];
2027 let err = read_terminateur(&mut Cursor::new(data)).unwrap_err();
2028 assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
2029 }
2030
2031 // ── find_catalogue: full-scan fallback + body-too-short ────────────────────
2032
2033 #[test]
2034 fn find_catalogue_falls_back_to_full_scan() {
2035 // Escape near the start; a tiny tail window misses it, forcing the
2036 // archive_origin full-scan fallback.
2037 let mut data = vec![0x11u8, 0x22]; // junk before the escape
2038 data.extend_from_slice(&SEQT_CATALOGUE);
2039 data.extend_from_slice(&[0x33u8; 12]); // trailing bytes beyond the tail window
2040 let mut c = Cursor::new(data);
2041 let via_escape = find_catalogue_within(&mut c, &[0u8; 10], 4).unwrap();
2042 assert!(via_escape);
2043 assert_eq!(c.position(), 2 + SEQT_CATALOGUE.len() as u64);
2044 }
2045
2046 #[test]
2047 fn find_catalogue_full_scan_miss_returns_not_found() {
2048 // No escape and no matching label anywhere; a tiny tail window forces
2049 // the full-scan fallback, which also misses → "not found".
2050 let mut c = Cursor::new(vec![0x11u8; 16]);
2051 let err = find_catalogue_within(&mut c, &[0xABu8; 10], 4).unwrap_err();
2052 assert!(matches!(&err, DarError::Corrupt(s) if s == "seqt_catalogue not found"));
2053 }
2054
2055 #[test]
2056 fn find_catalogue_body_too_short_when_origin_at_eof() {
2057 let mut c = Cursor::new(vec![0u8; 6]);
2058 c.seek(SeekFrom::Start(6)).unwrap();
2059 let err = find_catalogue(&mut c, &[0u8; 10]).unwrap_err();
2060 assert!(matches!(&err, DarError::Corrupt(s) if s == "archive body too short"));
2061 }
2062
2063 // ── decode_stream / CapWriter ────────────────────────────────────────────
2064
2065 #[test]
2066 fn decode_stream_caps_decompression_bomb() {
2067 use flate2::{write::ZlibEncoder, Compression};
2068 use std::io::Write;
2069 let mut enc = ZlibEncoder::new(Vec::new(), Compression::default());
2070 enc.write_all(&[0u8; 4096]).unwrap();
2071 let blob = enc.finish().unwrap();
2072 // Inflates to 4096 bytes but the CapWriter caps output at 16.
2073 let mut sink = Vec::new();
2074 let mut cap = CapWriter {
2075 inner: &mut sink,
2076 written: 0,
2077 max: 16,
2078 };
2079 let err = decode_stream(&blob[..], b'z', &mut cap).unwrap_err();
2080 assert!(matches!(&err, DarError::Corrupt(s) if s.contains("exceeds bound")));
2081 }
2082
2083 #[test]
2084 fn decode_stream_rejects_malformed_zlib() {
2085 let err = decode_stream(
2086 b"not a zlib stream at all".as_slice(),
2087 b'z',
2088 &mut Vec::new(),
2089 )
2090 .unwrap_err();
2091 assert!(matches!(&err, DarError::Corrupt(s) if s.contains("zlib decode failed")));
2092 }
2093
2094 #[test]
2095 fn decode_stream_rejects_malformed_bzip2() {
2096 let err =
2097 decode_stream(b"not a bzip2 stream".as_slice(), b'y', &mut Vec::new()).unwrap_err();
2098 assert!(matches!(&err, DarError::Corrupt(s) if s.contains("bzip2 decode failed")));
2099 }
2100
2101 #[test]
2102 fn decode_stream_rejects_malformed_xz() {
2103 let err = decode_stream(
2104 b"this is not an xz stream".as_slice(),
2105 b'x',
2106 &mut Vec::new(),
2107 )
2108 .unwrap_err();
2109 assert!(matches!(&err, DarError::Corrupt(s) if s.contains("xz decode failed")));
2110 }
2111
2112 #[test]
2113 fn decode_stream_rejects_malformed_zstd() {
2114 let err = decode_stream(b"not a zstd frame".as_slice(), b'd', &mut Vec::new()).unwrap_err();
2115 assert!(matches!(&err, DarError::Corrupt(s) if s.contains("zstd decode failed")));
2116 }
2117
2118 #[test]
2119 fn decode_stream_rejects_unknown_codec() {
2120 // No streamed codec routes here in a full build; a stray byte must error.
2121 let err = decode_stream(b"data".as_slice(), b'?', &mut Vec::new()).unwrap_err();
2122 assert!(
2123 matches!(&err, DarError::Corrupt(s) if s.contains("unrecognised compression codec"))
2124 );
2125 }
2126
2127 #[test]
2128 fn header_flags_single_two_byte_and_overlong() {
2129 // Single byte (low bit clear): value is `byte & 0xFE`.
2130 assert_eq!(read_header_flags(&mut [0x10u8].as_slice()).unwrap(), 0x10);
2131 // Two bytes (first low bit set = continuation): 0x09,0x08 -> 0x0808.
2132 assert_eq!(
2133 read_header_flags(&mut [0x09u8, 0x08].as_slice()).unwrap(),
2134 0x0808
2135 );
2136 // A field that never terminates within 8 bytes is rejected.
2137 let err = read_header_flags(&mut [0xFFu8; 9].as_slice()).unwrap_err();
2138 assert!(matches!(&err, DarError::Corrupt(s) if s.contains("flag field too large")));
2139 }
2140
2141 #[test]
2142 fn compr_bs_edition_one_is_zero() {
2143 // Edition < 2 has no flag field, hence no block size.
2144 assert_eq!(read_compr_bs(&mut b"cmdline\x00rest".as_slice(), 1), 0);
2145 }
2146
2147 #[test]
2148 fn compr_bs_read_after_initial_offset() {
2149 // cmd_line "\0" | flags 0x0808 (HAS_COMPRESS_BS + INITIAL_OFFSET) |
2150 // initial_offset (skipped) | compr_bs = 42.
2151 let mut buf = vec![0x00u8]; // empty command line
2152 buf.extend_from_slice(&[0x09, 0x08]); // flags = 0x0808
2153 buf.extend_from_slice(&[0x80, 0, 0, 0, 0]); // initial_offset = 0
2154 buf.extend_from_slice(&[0x80, 0, 0, 0, 42]); // compr_bs = 42
2155 assert_eq!(read_compr_bs(&mut buf.as_slice(), 11), 42);
2156 }
2157
2158 #[test]
2159 fn cap_writer_forwards_within_bound_and_fails_over() {
2160 use std::io::Write;
2161 let mut sink = Vec::new();
2162 let mut w = CapWriter {
2163 inner: &mut sink,
2164 written: 0,
2165 max: 4,
2166 };
2167 assert_eq!(w.write(b"ab").unwrap(), 2); // within bound
2168 w.flush().unwrap();
2169 let err = w.write(b"cde").unwrap_err(); // 2 + 3 > 4
2170 assert_eq!(err.to_string(), "decompressed data exceeds bound");
2171 assert_eq!(sink, b"ab");
2172 }
2173}