dar/
lib.rs

1//! Pure-Rust reader for Denis Corbin DAR (Disk ARchiver) archives.
2//!
3//! Supports DAR formats 7–11 (produced by dar 2.3–2.8) and the legacy ≤7 grammar.
4//! Passware Kit Mobile produces format-9 archives; dar 2.8.5 produces 11.3.
5//! Entries and the catalogue compressed with gzip, bzip2, xz, zstd, lz4 or lzo
6//! are transparently decompressed (pure-Rust; each an optional feature, all on by
7//! default); encryption is not decoded.
8//!
9//! ## Format sketch
10//!
11//! ```text
12//! Slice header:
13//!   [4]  magic = 00 00 00 7b  (SAUV_MAGIC_NUMBER = 123, big-endian u32)
14//!   [10] internal_name label
15//!   [1]  flag  [1]  ext_char
16//!   TLV list:  infinint(count) + count × (u16 type + infinint len + data)
17//!   ← archive_origin: all catalog archive_offset values are relative to here
18//!
19//! Archive body:
20//!   escaped sequences (seqt_file, seqt_saved, …) + raw file bytes
21//!
22//! Catalog  (located by seqt_catalogue escape: AD FD EA 77 21 43):
23//!   [10] label  +  (NUL working-dir path, format 11.1+ only)  +  entries
24//!
25//!   Each entry: cat_sig byte where (cat_sig & 0x1f | 0x60) gives type
26//!     'd' directory  → NUL-name + inode [+ FSA]  (push to dir stack)
27//!     'f' file       → NUL-name + inode [+ FSA] + file-specific fields
28//!     'z' EOD        → pop dir stack; depth=0 → done
29//! ```
30//!
31//! ## Key non-obvious invariants
32//!
33//! - **Infinint**: variable-length. The common form is 5 bytes
34//!   (`0x80 XX XX XX XX`, a big-endian u32); timestamps past 2^32 use the
35//!   9-byte `0x40` form (big-endian u64). Encodings wider than 64 bits are
36//!   rejected as corrupt — this reader decodes to `u64` or errors, never
37//!   truncates.
38//! - **Permissions**: 2-byte big-endian u16, *not* an infinint.
39//! - **Timestamps**: format 8 stores a bare seconds infinint; format 9+ prefix
40//!   a unit byte (`'s'`/`'u'`/`'n'`) and add a sub-second infinint for `'u'`/`'n'`.
41//! - **FSA** (format 9+ only): inode flag bit `0x10` (FSA-full) adds inode
42//!   infinints and an FSA block; format 8 has no FSA.
43//! - **archive_offset**: points *directly* to the raw file bytes, not to the
44//!   data-section header that precedes them in the body stream.
45//!   `seek(archive_origin + archive_offset)` then `read(stored_size)`.
46//!
47//! Full format notes: `docs/implementation-notes.md`.
48
49// Production code is panic-free (no unwrap/expect, enforced by the workspace
50// lints); tests legitimately use them.
51#![cfg_attr(test, allow(clippy::unwrap_used, clippy::expect_used))]
52
53use std::fs::File;
54use std::io::{Cursor, Read, Seek, SeekFrom, Write};
55use std::path::{Path, PathBuf};
56
57use thiserror::Error;
58
/// DAR slice magic `00 00 00 7b`: `SAUV_MAGIC_NUMBER` (123) stored as a
/// big-endian `u32`.
const DAR_MAGIC: [u8; 4] = *b"\x00\x00\x00\x7b";

/// Cap (512 MiB) on the compressed catalogue bytes read from the archive tail.
/// Together with [`MAX_CATALOGUE_INFLATED`] it guards against a decompression
/// bomb; per-file streams need no such constant because they are bounded by
/// the entry's known size.
const MAX_CATALOGUE_COMPRESSED: u64 = 512 << 20;
/// Cap (1 GiB) on the inflated catalogue — the other half of the
/// decompression-bomb guard.
const MAX_CATALOGUE_INFLATED: u64 = 1 << 30;

/// Cap (64 KiB) on a per-file CRC width. libdar uses 4 bytes per gigabyte, so
/// this covers a 16 TiB file; a larger declared width is treated as corrupt.
const MAX_CRC_SIZE: u64 = 64 << 10;

/// Cap (256 MiB) on the per-block uncompressed block size (`compr_bs`). A
/// header declaring more is treated as not block-compressed, which guards
/// against an allocation bomb. dar's default is 240 KiB, so the cap is far
/// beyond any practical setting.
const MAX_BLOCK_SIZE: u64 = 256 << 20;

/// The `seqt_catalogue` escape sequence that marks the catalog:
/// `AD FD EA 77 21 43`.
const SEQT_CATALOGUE: [u8; 6] = *b"\xAD\xFD\xEA\x77\x21\x43";

/// `archive_version(11,1)` as its numeric `value()` (`major * 256 + minor`):
/// the first archive format whose catalog header carries an in-place
/// (working-directory) path. Formats 8, 9, 10 and 11.0 have no such field.
const FORMAT_11_1: u32 = (11 << 8) | 1;
84
/// Errors returned by [`DarReader`].
#[derive(Debug, Error)]
pub enum DarError {
    /// An underlying read, seek or write failed; wraps the originating
    /// [`std::io::Error`].
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),
    /// The input does not begin with the DAR magic (or is too short to hold it).
    #[error("not a DAR archive")]
    NotADar,
    /// The archive is structurally invalid; the message names the check that
    /// failed.
    #[error("corrupt archive: {0}")]
    Corrupt(String),
    /// No catalogue entry has the requested path (carried lossily decoded, for
    /// display).
    #[error("entry not found: '{0}'")]
    EntryNotFound(String),
}
97
/// Result of checking a file entry's decompressed data against the CRC
/// recorded for it (see [`DarReader::verify`]). CRC values are lowercase hex.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub enum CrcStatus {
    /// The data agrees with the stored CRC.
    Match,
    /// The data disagrees with the stored CRC — consistent with corruption or
    /// tampering of the archived bytes.
    Mismatch {
        /// CRC recorded in the catalogue (lowercase hex).
        stored: String,
        /// CRC computed over the decompressed data (lowercase hex).
        computed: String,
    },
    /// The entry carries no CRC (edition-1 archives record none), so its
    /// integrity cannot be checked.
    NotStored,
}

impl core::fmt::Display for CrcStatus {
    /// Render a one-line, human-readable verdict.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The two payload-free verdicts are fixed strings; only a mismatch
        // needs formatting.
        let (stored, computed) = match self {
            Self::Match => return f.write_str("CRC match"),
            Self::NotStored => return f.write_str("no CRC stored"),
            Self::Mismatch { stored, computed } => (stored, computed),
        };
        write!(f, "CRC mismatch: stored {stored}, computed {computed}")
    }
}
129
/// The kind of filesystem object a catalog entry describes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub enum EntryKind {
    /// A file.
    File,
    /// A directory.
    Directory,
    /// A symbolic link.
    Symlink,
    /// A named pipe.
    NamedPipe,
    /// A socket.
    Socket,
    /// A character device.
    CharDevice,
    /// A block device.
    BlockDevice,
    /// A hard link.
    Hardlink,
    /// A catalog entry type this reader does not model (the raw `cat_sig` letter).
    Unknown(char),
}
145
146/// Metadata about one archived filesystem object.
147///
148/// Paths and symlink targets are exposed as raw bytes — DAR (like the
149/// filesystems it archives) does not guarantee UTF-8, and a forensic reader
150/// must never lose or reject a byte-exact name. Use [`DarEntry::path_lossy`] for
151/// display.
152#[derive(Debug, Clone)]
153#[cfg_attr(feature = "serde", derive(serde::Serialize))]
154pub struct DarEntry {
155    /// Path as stored, raw bytes — may not be valid UTF-8. In JSON this is the
156    /// lossy-UTF-8 display string (use the field directly for byte-exact data).
157    #[cfg_attr(feature = "serde", serde(serialize_with = "serialize_bytes_lossy"))]
158    pub path: Vec<u8>,
159    /// What kind of filesystem object this entry describes.
160    pub kind: EntryKind,
161    /// Uncompressed size in bytes (0 for entries with no data).
162    pub size: u64,
163    /// Owner user id.
164    pub uid: u64,
165    /// Owner group id.
166    pub gid: u64,
167    /// Permission bits (the low bits of the mode).
168    pub mode: u16,
169    /// Access time, seconds since the Unix epoch.
170    pub atime: i64,
171    /// Modification time, seconds since the Unix epoch.
172    pub mtime: i64,
173    /// Status-change time, seconds since the Unix epoch; `None` for formats
174    /// before 8, which do not record it.
175    pub ctime: Option<i64>,
176    /// Target of a symbolic link, raw bytes; `None` for non-symlinks. In JSON
177    /// this is the lossy-UTF-8 display string (or null).
178    #[cfg_attr(feature = "serde", serde(serialize_with = "serialize_opt_bytes_lossy"))]
179    pub symlink_target: Option<Vec<u8>>,
180}
181
182impl DarEntry {
183    /// The path decoded as lossy UTF-8 (invalid byte sequences become U+FFFD).
184    #[must_use]
185    pub fn path_lossy(&self) -> std::borrow::Cow<'_, str> {
186        String::from_utf8_lossy(&self.path)
187    }
188}
189
/// JSON-export projection of raw path/target bytes: emits them as a
/// lossy-UTF-8 string. This is a display form only (serde_json escapes control
/// characters); the byte-exact value stays available through the typed field.
#[cfg(feature = "serde")]
fn serialize_bytes_lossy<S>(bytes: &[u8], s: S) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    let text = String::from_utf8_lossy(bytes);
    s.serialize_str(&text)
}
197
/// JSON-export projection of an optional raw byte string: `Some` becomes its
/// lossy-UTF-8 display string, `None` becomes null.
// serde's `serialize_with` passes `&self.field`, so the parameter has to be
// `&Option<_>` rather than `Option<&_>`; the lint does not apply here.
#[cfg(feature = "serde")]
#[allow(clippy::ref_option)]
fn serialize_opt_bytes_lossy<S>(target: &Option<Vec<u8>>, s: S) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    let Some(bytes) = target else {
        return s.serialize_none();
    };
    s.serialize_some(&String::from_utf8_lossy(bytes))
}
211
/// Internal catalogue record: the public [`DarEntry`] metadata plus the
/// fields the reader needs to locate, decode and verify the entry's data.
#[derive(Debug, Clone)]
struct EntryRef {
    /// Path as stored, raw bytes.
    path: Vec<u8>,
    /// Kind of filesystem object.
    kind: EntryKind,
    /// Uncompressed size in bytes; extraction caps and checks its output
    /// against this value.
    size: u64,
    /// Owner user id.
    uid: u64,
    /// Owner group id.
    gid: u64,
    /// Permission bits.
    mode: u16,
    /// Access time.
    atime: i64,
    /// Modification time.
    mtime: i64,
    /// Status-change time, when recorded.
    ctime: Option<i64>,
    /// Symlink target, raw bytes, when present.
    symlink_target: Option<Vec<u8>>,
    /// Offset of the entry's data relative to the reader's `archive_origin`.
    archive_offset: u64,
    /// Number of bytes the entry's data occupies in the archive (the
    /// compressed length for a compressed entry).
    stored_size: u64,
    /// Compression algorithm byte, as interpreted by `is_compressed` and the
    /// decode helpers.
    compression: u8,
    /// Stored per-file data CRC (raw bytes); `None` when the format records none
    /// (edition 1) or the width is zero.
    crc: Option<Vec<u8>>,
}
231
232impl EntryRef {
233    /// Project the internal entry into the public [`DarEntry`] (one clone of the
234    /// owned path/target fields).
235    fn to_dar_entry(&self) -> DarEntry {
236        DarEntry {
237            path: self.path.clone(),
238            kind: self.kind,
239            size: self.size,
240            uid: self.uid,
241            gid: self.gid,
242            mode: self.mode,
243            atime: self.atime,
244            mtime: self.mtime,
245            ctime: self.ctime,
246            symlink_target: self.symlink_target.clone(),
247        }
248    }
249}
250
/// Read-only DAR archive reader.
///
/// [`DarReader::open`] parses the slice header and the catalogue up front;
/// entry data is read from `inner` only when an entry is extracted or verified.
pub struct DarReader<R: Read + Seek> {
    /// The underlying archive stream; sought and read on every extraction.
    inner: R,
    /// Byte position immediately after the slice header TLV block.
    /// `archive_origin + archive_offset` = absolute position of raw file bytes.
    archive_origin: u64,
    /// Archive format major version (`value() >> 8`). Format 1 stores no
    /// per-entry `storage_size`, so a compressed format-1 entry is decoded by
    /// streaming the codec to its natural end rather than reading a fixed length.
    format_major: u32,
    /// Whether the catalog parsed to a clean root EOD (see [`DarReader::is_complete`]).
    complete: bool,
    /// Uncompressed block size from the header (`FLAG_HAS_COMPRESS_BS`); non-zero
    /// means the archive uses dar's per-block compression framing, zero means a
    /// single codec stream. Governs both the catalogue and every entry.
    compr_bs: u64,
    /// Catalogue entries in the order the catalogue parser produced them.
    entries: Vec<EntryRef>,
}
269
270impl<R: Read + Seek> DarReader<R> {
    /// Open a DAR archive from `reader`, parsing the slice header and the
    /// whole catalogue up front.
    ///
    /// Two header grammars are handled, selected by the header's extension
    /// byte:
    ///
    /// - `'T'` (format 8+): a TLV list follows the header; the catalogue is
    ///   located by `find_catalogue` (the `seqt_catalogue` escape, or the
    ///   archive's `data_name` / slice label when tape marks are off).
    /// - `'N'` / `'S'` (format <= 7): no TLV list; the catalogue is located
    ///   through the end-of-archive `terminateur` trailer.
    ///
    /// In both cases a compressed catalogue is inflated into memory before it
    /// is parsed.
    ///
    /// # Errors
    ///
    /// - [`DarError::NotADar`] if the first four bytes cannot be read or are
    ///   not the DAR magic.
    /// - [`DarError::Corrupt`] for a truncated TLV count, an unknown header
    ///   extension, a catalogue that cannot be located, or a legacy catalogue
    ///   offset that overflows or lies at/after the end of the archive.
    /// - [`DarError::Io`] for any other read or seek failure.
    // The archive/slice-header parser is one cohesive state machine; splitting
    // it would scatter the format logic across helpers and hurt readability.
    #[allow(clippy::too_many_lines)]
    pub fn open(mut reader: R) -> Result<Self, DarError> {
        let mut magic = [0u8; 4];
        reader
            .read_exact(&mut magic)
            .map_err(|_| DarError::NotADar)?;
        if magic != DAR_MAGIC {
            return Err(DarError::NotADar);
        }

        let mut label = [0u8; 10];
        reader.read_exact(&mut label)?; // internal_name label
        let _flag = read_u8(&mut reader)?; // slice flag ('T' terminal / 'N' / 'E')
        let extension = read_u8(&mut reader)?; // 'T' = TLV (format 8+); 'N'/'S' = legacy (<= 7)

        // Format 8+ carries a TLV list and a `seqt_catalogue` escape; format <= 7
        // has neither — its catalogue is located via the end `terminateur` trailer
        // (libdar header.cpp extension handling; terminateur.cpp).
        let entries;
        let archive_origin;
        let format_major;
        let complete;
        let compr_bs;
        if extension == b'T' {
            // TLV list: infinint(count) then count × (u16 type + infinint len + data)
            let tlv_count = read_infinint(&mut reader).map_err(|e| match e {
                DarError::Io(_) => DarError::Corrupt("truncated TLV block".into()),
                other => other,
            })?;
            // The archive's data_name (TLV type 0x0003, a 10-byte label) is the
            // identity the catalogue's ref_data_name points at. It is preserved
            // when an archive is re-sliced (dar_xform) even though the slice's own
            // internal_name changes, so it — not the slice label — locates a
            // tape-marks-off catalogue. For a normally-created archive the two are
            // identical, so this is a no-op there.
            let mut data_name: Option<[u8; 10]> = None;
            for _ in 0..tlv_count {
                let mut typ = [0u8; 2];
                reader.read_exact(&mut typ)?;
                let len = read_infinint(&mut reader)?;
                if typ == [0x00, 0x03] && len == 10 {
                    let mut dn = [0u8; 10];
                    reader.read_exact(&mut dn)?;
                    data_name = Some(dn);
                } else {
                    skip(&mut reader, len)?;
                }
            }

            archive_origin = reader.stream_position()?;
            let format_value = read_format_value(&mut reader);
            // The archive's global compression algorithm is the byte immediately
            // after the version string; it tells us whether (and how) the
            // catalogue stream is compressed. Unreadable → treat as stored.
            let global_comp = read_u8(&mut reader).unwrap_or(b'n');
            // The cursor now sits at the command-line string; read on to the
            // compression block size (zero = single-stream, non-zero = per-block).
            compr_bs = read_compr_bs(&mut reader, format_value >> 8);
            // Rewind so the catalogue search starts at archive_origin.
            reader.seek(SeekFrom::Start(archive_origin))?;

            // true → seqt_catalogue tape mark found (catalog has label + maybe path);
            // false → located by its ref_data_name label (tape marks off, e.g. Passware).
            let via_escape = find_catalogue(&mut reader, data_name.as_ref().unwrap_or(&label))?;
            format_major = format_value >> 8;
            if via_escape && is_compressed(global_comp) {
                // The catalogue is a single stream compressed with the archive
                // codec, beginning right after the seqt_catalogue escape and
                // running to the trailer. Inflate it, then parse from the
                // plaintext buffer — which begins with the in-catalog label and
                // optional in-place path, exactly like the uncompressed case.
                let mut compressed = Vec::new();
                reader
                    .by_ref()
                    .take(MAX_CATALOGUE_COMPRESSED)
                    .read_to_end(&mut compressed)?;
                let inflated = inflate_catalogue(&compressed, global_comp, compr_bs)?;
                let mut cur = Cursor::new(inflated);
                skip(&mut cur, 10)?; // catalog label
                if format_value >= FORMAT_11_1 {
                    skip_nul_string(&mut cur)?;
                }
                (entries, complete) = parse_catalog(&mut cur, format_major, global_comp)?;
            } else {
                // The catalogue opens with a 10-byte label and, from format 11.1,
                // an in-place path NUL-string before the entries. When located by
                // the seqt_catalogue escape the reader sits before the label; when
                // located by ref_data_name match (tape marks off) scan_window has
                // already consumed the matched label, so only the path remains.
                if via_escape {
                    skip(&mut reader, 10)?; // catalog label
                }
                if format_value >= FORMAT_11_1 {
                    skip_nul_string(&mut reader)?;
                }
                (entries, complete) = parse_catalog(&mut reader, format_major, global_comp)?;
            }
        } else if extension == b'N' || extension == b'S' {
            // Legacy editions (<= 7) predate block compression — always a stream.
            compr_bs = 0;
            if extension == b'S' {
                read_infinint(&mut reader)?; // slice size (multi-slice header); unused
            }
            archive_origin = reader.stream_position()?;
            let format_value = read_format_value(&mut reader); // 3-byte edition: value = major*256
            format_major = format_value >> 8;
            // The global compression char follows the version string (same as
            // format 8+). Formats <= 7 carry no per-entry compression byte, so
            // this single char governs both the catalogue and every entry's data.
            let global_comp = read_u8(&mut reader).unwrap_or(b'n');
            let cat_offset = read_terminateur(&mut reader)?;
            // cat_offset comes from the archive, so the sum is overflow-checked
            // and bounded by the real end of the stream before seeking to it.
            let cat_start = archive_origin
                .checked_add(cat_offset)
                .ok_or_else(|| DarError::Corrupt("catalogue offset overflows".into()))?;
            let end = reader.seek(SeekFrom::End(0))?;
            if cat_start >= end {
                return Err(DarError::Corrupt(format!(
                    "catalogue start {cat_start} past archive end {end}"
                )));
            }
            reader.seek(SeekFrom::Start(cat_start))?;
            // Legacy catalogue: no 10-byte label, no path — entries begin here.
            // When the archive is compressed, the catalogue is a single codec
            // stream (the terminateur addresses its start); inflate it first.
            if is_compressed(global_comp) {
                let mut compressed = Vec::new();
                reader
                    .by_ref()
                    .take(MAX_CATALOGUE_COMPRESSED)
                    .read_to_end(&mut compressed)?;
                let inflated = inflate_catalogue(&compressed, global_comp, compr_bs)?;
                (entries, complete) =
                    parse_catalog(&mut Cursor::new(inflated), format_major, global_comp)?;
            } else {
                (entries, complete) = parse_catalog(&mut reader, format_major, global_comp)?;
            }
        } else {
            return Err(DarError::Corrupt(format!(
                "unknown slice-header extension {extension:#04x}"
            )));
        }

        Ok(Self {
            inner: reader,
            archive_origin,
            format_major,
            complete,
            compr_bs,
            entries,
        })
    }
423
    /// Number of catalogue entries, in O(1): the length of the already-parsed
    /// entry list, without materialising or cloning any entry (cheap even for
    /// a multi-hundred-thousand-entry archive).
    #[must_use]
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
430
    /// Iterate the catalogue entries lazily, cloning one [`DarEntry`] at a time
    /// rather than allocating the whole `Vec` up front — for streaming over a
    /// large archive (hashing, timelining, filtering) without holding a second
    /// copy of every entry in memory at once. Entries are yielded in the order
    /// the reader stores them. Use [`entries`](Self::entries) when you want
    /// them all collected.
    pub fn iter_entries(&self) -> impl Iterator<Item = DarEntry> + '_ {
        self.entries.iter().map(EntryRef::to_dar_entry)
    }
439
440    /// List all archived file entries (path and uncompressed size).
441    pub fn entries(&self) -> Vec<DarEntry> {
442        self.iter_entries().collect()
443    }
444
    /// Whether the catalog was parsed to a clean end (its root end-of-directory
    /// marker), as recorded when the archive was opened.
    ///
    /// `false` means parsing stopped early — typically at a catalog entry type
    /// this reader does not model (e.g. a hardlink or device node) or at
    /// corruption — so [`entries`](Self::entries) may be an *incomplete* listing.
    /// A forensic caller should treat an incomplete listing as "more may exist".
    #[must_use]
    pub fn is_complete(&self) -> bool {
        self.complete
    }
455
456    /// Verify a file entry's data against the CRC stored in the catalogue,
457    /// decompressing the entry as needed. Returns [`CrcStatus::Match`],
458    /// [`CrcStatus::Mismatch`], or [`CrcStatus::NotStored`]. Unlike a
459    /// verify-on-extract design, this never refuses to hand over the bytes —
460    /// a forensic caller can still [`extract`](Self::extract) data that fails
461    /// its CRC in order to examine the corruption.
462    pub fn verify<P: AsRef<[u8]>>(&mut self, path: P) -> Result<CrcStatus, DarError> {
463        let path = path.as_ref();
464        let stored = self
465            .entries
466            .iter()
467            .find(|e| e.path.as_slice() == path)
468            .ok_or_else(|| DarError::EntryNotFound(String::from_utf8_lossy(path).into_owned()))?
469            .crc
470            .clone();
471        let Some(stored) = stored else {
472            return Ok(CrcStatus::NotStored);
473        };
474        // The CRC covers the plaintext, so verify against the decompressed data.
475        let data = self.extract(path)?;
476        let computed = dar_crc(&data, stored.len());
477        if computed == stored {
478            Ok(CrcStatus::Match)
479        } else {
480            Ok(CrcStatus::Mismatch {
481                stored: to_hex(&stored),
482                computed: to_hex(&computed),
483            })
484        }
485    }
486
487    /// Extract a file by path, streaming its (decompressed) bytes to `out` and
488    /// returning the number of bytes written. Unlike [`extract`](Self::extract),
489    /// this never holds the whole file in memory, so it is safe for multi-GiB
490    /// entries (and composes with hashing, scanning, or writing to disk).
491    pub fn extract_to<P: AsRef<[u8]>, W: Write>(
492        &mut self,
493        path: P,
494        out: &mut W,
495    ) -> Result<u64, DarError> {
496        let path = path.as_ref();
497        let name = String::from_utf8_lossy(path);
498        let entry = self
499            .entries
500            .iter()
501            .find(|e| e.path.as_slice() == path)
502            .ok_or_else(|| DarError::EntryNotFound(name.clone().into_owned()))?
503            .clone();
504
505        // The raw bytes live at archive_origin + archive_offset. Both fields are
506        // attacker-controlled, so the sum is checked and the claimed length
507        // validated against the bytes that actually exist before reading.
508        let start = self
509            .archive_origin
510            .checked_add(entry.archive_offset)
511            .ok_or_else(|| {
512                DarError::Corrupt(format!("'{name}' archive offset overflows file position"))
513            })?;
514        let end = self.inner.seek(SeekFrom::End(0))?;
515        if start > end {
516            return Err(DarError::Corrupt(format!(
517                "'{name}' starts at {start}, past archive end {end}"
518            )));
519        }
520        let available = end - start;
521        self.inner.seek(SeekFrom::Start(start))?;
522
523        // Stored: stream the raw bytes straight through, no buffering.
524        if !is_compressed(entry.compression) {
525            if entry.stored_size > available {
526                return Err(DarError::Corrupt(format!(
527                    "'{name}' claims {} stored bytes but only {available} remain",
528                    entry.stored_size
529                )));
530            }
531            return Ok(std::io::copy(
532                &mut self.inner.by_ref().take(entry.stored_size),
533                out,
534            )?);
535        }
536
537        // Compressed: decode straight to `out`, capped at the declared size so a
538        // forged stream cannot over-inflate (streaming decompression-bomb guard).
539        let mut cap = CapWriter {
540            inner: out,
541            written: 0,
542            max: entry.size,
543        };
544        if self.format_major == 1 {
545            // Format 1 stores no storage_size; the codec stream (dar 1.x is
546            // gzip/zlib-only) runs from the offset to its own natural end.
547            decode_stream(self.inner.by_ref(), entry.compression, &mut cap)?;
548        } else {
549            // 8+/2-7: exactly stored_size compressed bytes on disk.
550            if entry.stored_size > available {
551                return Err(DarError::Corrupt(format!(
552                    "'{name}' claims {} stored bytes but only {available} remain",
553                    entry.stored_size
554                )));
555            }
556            let mut data = vec![0u8; entry.stored_size as usize];
557            self.inner.read_exact(&mut data)?;
558            decode_data(&data[..], entry.compression, self.compr_bs, &mut cap)?;
559        }
560        if cap.written != entry.size {
561            return Err(DarError::Corrupt(format!(
562                "'{name}' decompressed to {} bytes but catalog declares {}",
563                cap.written, entry.size
564            )));
565        }
566        Ok(cap.written)
567    }
568
569    /// Extract a file by path, returning its raw bytes. Buffers the whole entry
570    /// in memory; prefer [`extract_to`](Self::extract_to) for large files.
571    pub fn extract<P: AsRef<[u8]>>(&mut self, path: P) -> Result<Vec<u8>, DarError> {
572        let mut buf = Vec::new();
573        self.extract_to(path, &mut buf)?;
574        Ok(buf)
575    }
576}
577
578// ── Catalog parser ────────────────────────────────────────────────────────────
579
/// Tail-scan window (256 MiB). On archives larger than this the catalog scan
/// starts this many bytes before EOF (the catalog always lives at the tail),
/// avoiding a full read of a multi-gigabyte forensic archive before falling
/// back to a full scan.
const TAIL_SCAN: u64 = 256 << 20;

/// Number of fresh bytes requested per read while scanning (4 MiB).
const CHUNK: usize = 4 << 20;
/// Bytes carried across chunk boundaries so a marker split between two reads
/// is still seen: `max(SEQT_CATALOGUE.len(), label.len()) - 1`.
const OVERLAP: usize = 9;
588
589/// Scan forward from the current reader position searching for either the
590/// `seqt_catalogue` escape or the archive `label`.
591///
592/// Returns `Some(true)` if the escape was found (reader positioned just after it),
593/// `Some(false)` if the label was found (reader positioned just after it),
594/// `None` if EOF was reached without a match.
595fn scan_window<R: Read + Seek>(
596    r: &mut R,
597    label: &[u8; 10],
598    use_label: bool,
599) -> Result<Option<bool>, DarError> {
600    let mut buf = vec![0u8; CHUNK + OVERLAP];
601    let mut overlap_len: usize = 0;
602    loop {
603        let chunk_file_pos = r.stream_position()?;
604        let n = r.read(&mut buf[overlap_len..overlap_len + CHUNK])?;
605        if n == 0 {
606            break;
607        }
608        let total = overlap_len + n;
609        // buf[0..overlap_len]  → tail of previous chunk (file pos: chunk_file_pos - overlap_len)
610        // buf[overlap_len..total] → newly read bytes
611        let buf_base = chunk_file_pos - overlap_len as u64;
612
613        if let Some(i) = buf[..total]
614            .windows(SEQT_CATALOGUE.len())
615            .position(|w| w == SEQT_CATALOGUE)
616        {
617            r.seek(SeekFrom::Start(
618                buf_base + i as u64 + SEQT_CATALOGUE.len() as u64,
619            ))?;
620            return Ok(Some(true));
621        }
622        if use_label {
623            if let Some(i) = buf[..total]
624                .windows(label.len())
625                .position(|w| w == label.as_ref())
626            {
627                r.seek(SeekFrom::Start(buf_base + i as u64 + label.len() as u64))?;
628                return Ok(Some(false));
629            }
630        }
631
632        let keep = OVERLAP.min(total);
633        buf.copy_within(total - keep..total, 0);
634        overlap_len = keep;
635    }
636    Ok(None)
637}
638
/// Locate the catalog section and position the reader at its first entry.
///
/// Thin wrapper over [`find_catalogue_within`] using the production tail
/// window [`TAIL_SCAN`]. The reader's position on entry is taken as the
/// archive origin.
///
/// Returns `true` when the `seqt_catalogue` escape is found — the caller then
/// skips the 10-byte in-catalog label and (format 11.1+) the path NUL string.
/// The escape is a *sequential-read tape mark*; it is present only when the
/// archive was written with tape marks (libdar's default).
///
/// Returns `false` when the catalog is located by its `ref_data_name` label
/// directly. Archives written with tape marks disabled (e.g. by Passware Kit
/// Mobile, equivalent to `dar -at`) omit the escape; their catalog still begins
/// with the 10-byte `ref_data_name`, which equals the slice `label`, so scanning
/// for `label` in the tail finds it — a structural marker, not a heuristic.
///
/// Returns `Err(Corrupt)` when neither marker is found.
///
/// Strategy: DAR catalogs always live at the tail of the archive.  On forensic
/// archives ≥ 256 MiB we jump straight to the last 256 MiB and scan forward
/// from there, then fall back to a full forward scan from `archive_origin` if
/// needed.  This reduces the I/O for a 92 GiB archive from ~99 GiB to ~107 MiB.
fn find_catalogue<R: Read + Seek>(r: &mut R, label: &[u8; 10]) -> Result<bool, DarError> {
    find_catalogue_within(r, label, TAIL_SCAN)
}
661
662/// Implementation of [`find_catalogue`] with the tail-scan window size as a
663/// parameter so the full-scan fallback can be exercised without a 256 MiB
664/// fixture.
665fn find_catalogue_within<R: Read + Seek>(
666    r: &mut R,
667    label: &[u8; 10],
668    tail_scan: u64,
669) -> Result<bool, DarError> {
670    // All-zero labels cannot be used as a reliable catalog marker (too common
671    // in zero-padded archive bodies).
672    let use_label = !label.iter().all(|&b| b == 0);
673
674    let archive_origin = r.stream_position()?;
675    let file_end = r.seek(SeekFrom::End(0))?;
676
677    if file_end <= archive_origin {
678        return Err(DarError::Corrupt("archive body too short".into()));
679    }
680
681    // Jump to at most tail_scan bytes before end; for small files this equals archive_origin.
682    let tail_start = archive_origin.max(file_end.saturating_sub(tail_scan));
683    r.seek(SeekFrom::Start(tail_start))?;
684
685    if let Some(result) = scan_window(r, label, use_label)? {
686        return Ok(result);
687    }
688
689    // Tail scan missed.  Fall back to a full scan from archive_origin.
690    if tail_start > archive_origin {
691        r.seek(SeekFrom::Start(archive_origin))?;
692        if let Some(result) = scan_window(r, label, use_label)? {
693            return Ok(result);
694        }
695    }
696
697    Err(DarError::Corrupt("seqt_catalogue not found".into()))
698}
699
700/// The byte length of one slice's header (`magic + label + flag + extension +
701/// optional TLV / slice-size`). Every slice of a multi-volume archive begins
702/// with this header; slice 1's header is the archive's own slice header, while
703/// later slices' headers are stripped so only their data regions join the
704/// logical stream. Mirrors the header prefix parsed by [`DarReader::open`].
705fn slice_header_len<R: Read + Seek>(r: &mut R) -> Result<u64, DarError> {
706    let mut magic = [0u8; 4];
707    r.read_exact(&mut magic).map_err(|_| DarError::NotADar)?;
708    if magic != DAR_MAGIC {
709        return Err(DarError::NotADar);
710    }
711    skip(r, 10)?; // internal_name label
712    let _flag = read_u8(r)?;
713    match read_u8(r)? {
714        b'T' => {
715            // TLV list: infinint(count) then count × (u16 type + infinint len + data).
716            let tlv_count = read_infinint(r)?;
717            for _ in 0..tlv_count {
718                skip(r, 2)?;
719                let len = read_infinint(r)?;
720                skip(r, len)?;
721            }
722        }
723        b'N' => {}
724        b'S' => {
725            read_infinint(r)?; // legacy slice-size field
726        }
727        other => {
728            return Err(DarError::Corrupt(format!(
729                "unknown slice-header extension {other:#04x}"
730            )));
731        }
732    }
733    Ok(r.stream_position()?)
734}
735
/// One slice's contribution to the logical archive stream.
struct SliceSpan {
    /// Open handle on the slice file this span reads from.
    file: File,
    /// Byte offset within the slice file where this slice's contributed data
    /// begins — 0 for slice 1 (its header is kept), the header length otherwise.
    file_data_start: u64,
    /// Where this slice begins in the logical (de-sliced) stream.
    logical_start: u64,
    /// Number of logical bytes this slice contributes.
    logical_len: u64,
}
747
/// A `Read + Seek` view over a multi-volume DAR archive (`base.1.dar`,
/// `base.2.dar`, …) presenting the slices as one contiguous logical stream:
/// slice 1 in full (its header is the archive's slice header) followed by every
/// later slice with its own slice header stripped. This is byte-identical to the
/// equivalent unsliced archive, so the catalogue and per-entry offsets resolve
/// across slice boundaries with no other change to the reader.
pub struct SliceReader {
    /// The slices in logical order, each with its position in the joined stream.
    slices: Vec<SliceSpan>,
    /// Current position in the logical stream.
    pos: u64,
    /// Total length of the logical stream (sum of every slice's `logical_len`).
    total: u64,
}
759
760impl SliceReader {
761    /// Build the logical stream from an explicit, ordered list of slice files
762    /// (`base.1.dar`, `base.2.dar`, …); the first path is slice 1.
763    pub fn open(paths: &[PathBuf]) -> Result<Self, DarError> {
764        if paths.is_empty() {
765            return Err(DarError::Corrupt("no slices provided".into()));
766        }
767        let mut slices = Vec::with_capacity(paths.len());
768        let mut logical_start = 0u64;
769        for (i, path) in paths.iter().enumerate() {
770            let mut file = File::open(path)?;
771            let len = file.seek(SeekFrom::End(0))?;
772            file.seek(SeekFrom::Start(0))?;
773            let file_data_start = if i == 0 {
774                0
775            } else {
776                slice_header_len(&mut file)?
777            };
778            // libdar's SAR layer ends every slice with a 1-byte flag ('N' = a slice
779            // follows, 'T' = terminal). On a non-terminal slice that flag sits in
780            // the middle of the file data and must be dropped; the terminal slice's
781            // flag is the archive's own final byte and is kept — so the logical
782            // stream ends byte-identically to an unsliced archive and the
783            // end-relative terminateur (tape-marks-off catalogues) still resolves.
784            let trailer = u64::from(i + 1 < paths.len());
785            if len < file_data_start + trailer {
786                return Err(DarError::Corrupt(
787                    "slice smaller than its header + flag".into(),
788                ));
789            }
790            let logical_len = len - file_data_start - trailer;
791            slices.push(SliceSpan {
792                file,
793                file_data_start,
794                logical_start,
795                logical_len,
796            });
797            logical_start = logical_start
798                .checked_add(logical_len)
799                .ok_or_else(|| DarError::Corrupt("total slice length overflows".into()))?;
800        }
801        Ok(Self {
802            slices,
803            pos: 0,
804            total: logical_start,
805        })
806    }
807}
808
809impl Read for SliceReader {
810    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
811        // Fill `buf` across slice boundaries — only stopping short at end-of-archive
812        // or an underlying short read — so callers that issue a single `read()` and
813        // assume a full buffer (as they may for an in-memory `Cursor`) behave
814        // identically over a sliced archive.
815        let mut written = 0;
816        while written < buf.len() {
817            let pos = self.pos;
818            // The first slice whose data extends past `pos` contains it (slices are
819            // contiguous from 0); no such slice means end-of-archive.
820            let Some(idx) = self
821                .slices
822                .iter()
823                .position(|s| pos < s.logical_start + s.logical_len)
824            else {
825                break;
826            };
827            let n = {
828                let span = &mut self.slices[idx];
829                let within = pos - span.logical_start;
830                let want = (buf.len() - written).min((span.logical_len - within) as usize);
831                span.file
832                    .seek(SeekFrom::Start(span.file_data_start + within))?;
833                span.file.read(&mut buf[written..written + want])?
834            };
835            if n == 0 {
836                break; // truncated slice: stop, do not spin
837            }
838            self.pos += n as u64;
839            written += n;
840        }
841        Ok(written)
842    }
843}
844
845impl Seek for SliceReader {
846    fn seek(&mut self, from: SeekFrom) -> std::io::Result<u64> {
847        let target: i128 = match from {
848            SeekFrom::Start(n) => i128::from(n),
849            SeekFrom::End(n) => i128::from(self.total) + i128::from(n),
850            SeekFrom::Current(n) => i128::from(self.pos) + i128::from(n),
851        };
852        if target < 0 {
853            return Err(std::io::Error::new(
854                std::io::ErrorKind::InvalidInput,
855                "seek before start of archive",
856            ));
857        }
858        self.pos = target as u64;
859        Ok(self.pos)
860    }
861}
862
863impl DarReader<SliceReader> {
864    /// Open a multi-volume (sliced) archive from its basename: `base` resolves
865    /// `base.1.dar`, `base.2.dar`, … until a slice is missing. The catalogue
866    /// lives in the last slice and entry data may span slices — both are handled
867    /// transparently. Errors if no `base.1.dar` exists.
868    pub fn open_slices(basename: &Path) -> Result<Self, DarError> {
869        let parent = basename
870            .parent()
871            .filter(|p| !p.as_os_str().is_empty())
872            .unwrap_or_else(|| Path::new("."));
873        let stem = basename
874            .file_name()
875            .and_then(|s| s.to_str())
876            .ok_or_else(|| DarError::Corrupt("invalid slice basename".into()))?;
877        let mut paths = Vec::new();
878        let mut n = 1u64;
879        loop {
880            let p = parent.join(format!("{stem}.{n}.dar"));
881            if !p.exists() {
882                break;
883            }
884            paths.push(p);
885            n += 1;
886        }
887        if paths.is_empty() {
888            return Err(DarError::Corrupt(format!(
889                "no slices found for basename {}",
890                basename.display()
891            )));
892        }
893        DarReader::open(SliceReader::open(&paths)?)
894    }
895}
896
897/// Read the NUL-terminated `version_string` at the current position and return
898/// `archive_version::value()` = `major*256 + fix`, where `major = b0*256 + b1`
899/// and each byte is `value + 48`. Format <= 7 stores only `"NN"` (fix implicitly
900/// 0); format 8+ stores `"NNf"`. Returns `u32::MAX` for an unreadable string so
901/// an unknown future format is treated as newest.
902fn read_format_value<R: Read>(r: &mut R) -> u32 {
903    let b = read_nul_bytes(r).unwrap_or_default();
904    if b.len() >= 2 {
905        let major = u32::from(b[0].saturating_sub(48)) * 256 + u32::from(b[1].saturating_sub(48));
906        let fix = if b.len() >= 3 {
907            u32::from(b[2].saturating_sub(48))
908        } else {
909            0
910        };
911        major * 256 + fix
912    } else {
913        u32::MAX
914    }
915}
916
917/// Read the multi-byte header flag field (libdar header_flags.cpp): bytes are
918/// accumulated most-significant-first, the low bit (`0x01`) of each byte signals
919/// that another byte follows, and the value bits are `byte & 0xFE`.
920fn read_header_flags<R: Read>(r: &mut R) -> Result<u64, DarError> {
921    let mut bits: u64 = 0;
922    loop {
923        let a = read_u8(r)?;
924        if bits >> 56 != 0 {
925            return Err(DarError::Corrupt("header flag field too large".into()));
926        }
927        bits = (bits << 8) | u64::from(a & 0xFE);
928        if a & 0x01 == 0 {
929            return Ok(bits);
930        }
931    }
932}
933
934/// Read the compression block size from the archive header (cursor positioned
935/// just after the global compression byte). A non-zero result selects dar's
936/// per-block decompression; 0 means a single codec stream.
937///
938/// Returns 0 for edition 1 (no flags), when no block size is recorded, when the
939/// value is implausibly large ([`MAX_BLOCK_SIZE`]), or when the header carries
940/// fields this reader does not parse (encryption / KDF / isolated-catalogue
941/// slicing — none of which are decodable anyway). Best-effort: a read error also
942/// degrades to 0, so a genuinely block-framed stream then fails loudly at the
943/// decode step rather than being silently mis-read. Existing single-stream
944/// archives are unaffected — they have no block size and resolve to 0.
945fn read_compr_bs<R: Read>(r: &mut R, format_major: u32) -> u64 {
946    fn inner<R: Read>(r: &mut R, format_major: u32) -> Result<u64, DarError> {
947        const INITIAL_OFFSET: u64 = 0x08;
948        const HAS_COMPRESS_BS: u64 = 0x0800;
949        // Fields sitting between the flags and the block size that this reader
950        // does not parse; archives that set them (encrypted / KDF / isolated
951        // catalogue) are not decodable regardless.
952        const COMPLEX: u64 = 0x20 | 0x04 | 0x02 | 0x0400; // scrambled | crypted-key | ref-slicing | kdf
953
954        skip_nul_string(r)?; // command line
955        if format_major < 2 {
956            return Ok(0); // the flag field was introduced at edition 2
957        }
958        let flags = read_header_flags(r)?;
959        if flags & COMPLEX != 0 || flags & HAS_COMPRESS_BS == 0 {
960            return Ok(0);
961        }
962        if flags & INITIAL_OFFSET != 0 {
963            read_infinint(r)?; // skip the initial offset
964        }
965        let bs = read_infinint(r)?;
966        Ok(if bs > MAX_BLOCK_SIZE { 0 } else { bs })
967    }
968    inner(r, format_major).unwrap_or(0)
969}
970
/// Whether a libdar compression char designates a known compression
/// algorithm. `compression2char` writes the algorithm letter in lowercase for
/// streamed mode and in uppercase for per-block mode, so the check ignores
/// ASCII case: `z`=gzip, `y`=bzip2, `x`=xz, `l`/`j`/`k`=lzo variants, `d`=zstd,
/// `q`=lz4. `n` means stored. Every other byte (for instance a header
/// placeholder in an archive not produced by dar) counts as "not compressed",
/// so the catalogue/entry is read verbatim instead of being mis-decoded.
fn is_compressed(algo: u8) -> bool {
    const CODEC_LETTERS: &[u8] = b"zyxljkdq";
    CODEC_LETTERS.contains(&algo.to_ascii_lowercase())
}
983
984/// Inflate a compressed catalogue into a single buffer, routing through the same
985/// [`decode_stream`]/[`CapWriter`] path the per-file extractor uses and capping
986/// output at `MAX_CATALOGUE_INFLATED` (decompression-bomb guard). Trailing bytes
987/// after the codec stream (the archive trailer) are ignored by the decoder.
988fn inflate_catalogue(compressed: &[u8], algo: u8, block_size: u64) -> Result<Vec<u8>, DarError> {
989    let mut out = Vec::new();
990    let mut cap = CapWriter {
991        inner: &mut out,
992        written: 0,
993        max: MAX_CATALOGUE_INFLATED,
994    };
995    decode_data(compressed, algo, block_size, &mut cap)?;
996    Ok(out)
997}
998
999/// Decode a compressed data span. The archive uses dar's per-block framing (see
1000/// [`decode_blocks`]) when a block size is recorded (`block_size > 0`) or the
1001/// codec is lz4/lzo — which have no streamed form and so are always block-framed
1002/// (dar applies a default block size that it does not store in the header).
1003/// Otherwise it is a single codec stream (see [`decode_stream`]).
1004fn decode_data<W: Write>(
1005    data: &[u8],
1006    algo: u8,
1007    block_size: u64,
1008    out: &mut W,
1009) -> Result<(), DarError> {
1010    let always_block = matches!(algo.to_ascii_lowercase(), b'q' | b'l' | b'j' | b'k');
1011    if block_size > 0 || always_block {
1012        decode_blocks(data, algo, block_size, out)
1013    } else {
1014        decode_stream(data, algo, out)
1015    }
1016}
1017
1018/// Decode a dar `block_compressor` stream: a sequence of blocks, each
1019/// `[type: 1 byte][infinint compressed_size][compressed_size bytes]`, terminated
1020/// by an `H_EOF` block (size 0). Each `H_DATA` block is decompressed
1021/// independently and appended to `out` (libdar block_compressor.cpp /
1022/// compress_block_header.cpp).
1023///
1024/// For lz4 each block is a raw LZ4 block decoded into a `block_size`-byte buffer;
1025/// for the other codecs each block is a complete, self-delimiting codec stream
1026/// decoded via [`decode_stream`]. `block_size` is the archive's uncompressed
1027/// block size (the lz4 destination capacity). Each block's compressed size is
1028/// bounded by the remaining input, which also bounds the loop to O(input)
1029/// iterations.
1030fn decode_blocks<W: Write>(
1031    data: &[u8],
1032    algo: u8,
1033    block_size: u64,
1034    out: &mut W,
1035) -> Result<(), DarError> {
1036    const H_DATA: u8 = 1;
1037    const H_EOF: u8 = 2;
1038
1039    let mut input = data;
1040    // Reusable destination buffer for the raw block codecs (lz4, lzo): their
1041    // blocks carry no uncompressed size, so each decodes into a buffer seeded to
1042    // the declared block size, or to cover dar's default (240 KiB) when the
1043    // archive records none — a block that overflows it is genuine corruption,
1044    // surfaced as a decode error rather than silently grown.
1045    let mut raw_block_buf: Vec<u8> =
1046        if matches!(algo.to_ascii_lowercase(), b'q' | b'l' | b'j' | b'k') {
1047            let seed = if block_size > 0 {
1048                block_size.min(MAX_BLOCK_SIZE) as usize
1049            } else {
1050                256 * 1024
1051            };
1052            vec![0u8; seed]
1053        } else {
1054            Vec::new()
1055        };
1056
1057    loop {
1058        let typ = read_u8(&mut input)
1059            .map_err(|_| DarError::Corrupt("truncated block stream: missing end marker".into()))?;
1060        let size = read_infinint(&mut input)?;
1061        match typ {
1062            H_EOF => {
1063                if size != 0 {
1064                    return Err(DarError::Corrupt(
1065                        "non-zero size on end-of-blocks marker".into(),
1066                    ));
1067                }
1068                return Ok(());
1069            }
1070            H_DATA => {
1071                if size == 0 {
1072                    return Err(DarError::Corrupt("zero-size compressed block".into()));
1073                }
1074                // A block cannot be larger than the bytes that remain in the
1075                // (already bounded) input. This both caps the allocation and,
1076                // since every block consumes at least its `size` bytes, bounds
1077                // the loop to O(input) iterations — no separate block-count cap.
1078                if size > input.len() as u64 {
1079                    return Err(DarError::Corrupt(
1080                        "compressed block size exceeds remaining input".into(),
1081                    ));
1082                }
1083                let mut block = vec![0u8; size as usize];
1084                input
1085                    .read_exact(&mut block)
1086                    .map_err(|_| DarError::Corrupt("truncated compressed block".into()))?;
1087                match algo.to_ascii_lowercase() {
1088                    b'q' => decode_lz4_block(&block, &mut raw_block_buf, out)?,
1089                    b'l' | b'j' | b'k' => decode_lzo_block(&block, &mut raw_block_buf, out)?,
1090                    // gzip/bzip2/xz/zstd block = a complete self-delimiting stream.
1091                    _ => decode_stream(&block[..], algo, out)?,
1092                }
1093            }
1094            other => {
1095                return Err(DarError::Corrupt(format!(
1096                    "unknown compressed block type {other}"
1097                )));
1098            }
1099        }
1100    }
1101}
1102
1103/// Decompress one raw lz4 block into `out` using `buf` (sized to the block size)
1104/// as the destination. A block that does not fit (or is malformed) is a decode
1105/// error — dar never writes a block larger than the archive's block size.
1106fn decode_lz4_block<W: Write>(block: &[u8], buf: &mut [u8], out: &mut W) -> Result<(), DarError> {
1107    let n = lz4_flex::block::decompress_into(block, buf)
1108        .map_err(|e| DarError::Corrupt(format!("lz4 block decode failed: {e}")))?;
1109    out.write_all(&buf[..n])?;
1110    Ok(())
1111}
1112
1113/// Decompress one raw lzo1x block into `out` using `buf` (sized to the block
1114/// size) as the destination. A block that does not fit, or is not a valid lzo1x
1115/// block, is a decode error — dar never writes a block larger than the archive's
1116/// block size, and the [`lzo`] decoder is bounds-checked, so malformed input
1117/// surfaces as a typed error rather than a panic.
1118fn decode_lzo_block<W: Write>(block: &[u8], buf: &mut [u8], out: &mut W) -> Result<(), DarError> {
1119    let n = lzo::decompress_into(block, buf)
1120        .map_err(|e| DarError::Corrupt(format!("lzo block decode failed: {e}")))?;
1121    out.write_all(&buf[..n])?;
1122    Ok(())
1123}
1124
/// A `Write` adapter that forwards to `inner`, counting bytes written and failing
/// once more than `max` would be written — the streaming decompression-bomb
/// guard used by [`DarReader::extract_to`].
struct CapWriter<'a, W: Write> {
    /// Destination that accepted bytes are forwarded to.
    inner: &'a mut W,
    /// Number of bytes forwarded so far.
    written: u64,
    /// Upper bound on the total number of bytes that may be forwarded.
    max: u64,
}

impl<W: Write> Write for CapWriter<'_, W> {
    /// Forward `data` in full, or fail without writing anything when doing so
    /// would push the running total above `max`. On success the whole slice is
    /// always consumed (`inner.write_all` is used), so the return value equals
    /// `data.len()`.
    fn write(&mut self, data: &[u8]) -> std::io::Result<usize> {
        // Checked add: with a cap close to `u64::MAX` a plain `+` would panic
        // in debug builds and wrap in release builds, letting the write slip
        // past the bound. An overflowing total is by definition over the cap.
        let within_bound = self
            .written
            .checked_add(data.len() as u64)
            .filter(|&total| total <= self.max);
        let Some(total) = within_bound else {
            return Err(std::io::Error::other("decompressed data exceeds bound"));
        };
        self.inner.write_all(data)?;
        self.written = total;
        Ok(data.len())
    }

    /// Flush the wrapped writer.
    fn flush(&mut self) -> std::io::Result<()> {
        self.inner.flush()
    }
}
1148
1149/// Stream-decode a compressed input to `out`, dispatching on the libdar codec
1150/// char. The Read decoders stop at the codec stream's end (ignoring trailing
1151/// bytes); lzma-rs rejects trailing bytes only after fully validating the
1152/// stream, so that one error is treated as success.
1153fn decode_stream<R: Read, W: Write>(input: R, algo: u8, out: &mut W) -> Result<(), DarError> {
1154    match algo.to_ascii_lowercase() {
1155        b'z' => {
1156            std::io::copy(&mut flate2::read::ZlibDecoder::new(input), out)
1157                .map_err(|e| DarError::Corrupt(format!("zlib decode failed: {e}")))?;
1158            Ok(())
1159        }
1160        b'y' => {
1161            std::io::copy(&mut bzip2_rs::DecoderReader::new(input), out)
1162                .map_err(|e| DarError::Corrupt(format!("bzip2 decode failed: {e}")))?;
1163            Ok(())
1164        }
1165        b'x' => {
1166            let mut br = std::io::BufReader::new(input);
1167            match lzma_rs::xz_decompress(&mut br, out) {
1168                Ok(()) => {}
1169                Err(lzma_rs::error::Error::XzError(ref m))
1170                    if m == "Unexpected data after last XZ block" => {}
1171                Err(e) => return Err(DarError::Corrupt(format!("xz decode failed: {e}"))),
1172            }
1173            Ok(())
1174        }
1175        b'd' => {
1176            // dar's streamed zstd is a standard zstd frame (ZSTD_compressStream).
1177            let mut dec = ruzstd::StreamingDecoder::new(input)
1178                .map_err(|e| DarError::Corrupt(format!("zstd decode failed: {e}")))?;
1179            std::io::copy(&mut dec, out)
1180                .map_err(|e| DarError::Corrupt(format!("zstd decode failed: {e}")))?;
1181            Ok(())
1182        }
1183        // An unrecognised codec char lands here — a clear error, never a silent
1184        // misread. (Single line so the e2e-coverage allowlist matches one specific line.)
1185        #[rustfmt::skip]
1186        other => Err(DarError::Corrupt(format!("unrecognised compression codec '{}'", other as char))),
1187    }
1188}
1189
1190/// Locate the catalogue in a pre-format-8 archive via the end `terminateur`
1191/// trailer (libdar terminateur.cpp:95-138), returning the catalogue start offset
1192/// relative to `archive_origin`.
1193///
1194/// From EOF, count trailing `0xFF` padding bytes (8 bits each); the first
1195/// non-`0xFF` byte encodes the remaining count in unary as its set high bits.
1196/// `byte_offset = total_bits * 4` is the distance back from that byte to the
1197/// catalogue-position infinint. The `0xFF` run is bounded so a hostile all-`0xFF`
1198/// tail cannot spin or overflow.
1199fn read_terminateur<R: Read + Seek>(r: &mut R) -> Result<u64, DarError> {
1200    const BLOCK_SIZE: u64 = 4;
1201    const MAX_BITS: u64 = 4096; // far beyond any real terminator
1202
1203    let mut pos = r.seek(SeekFrom::End(0))?;
1204    let mut bits: u64 = 0;
1205    let terminal = loop {
1206        if pos == 0 {
1207            return Err(DarError::Corrupt("terminator underflows archive".into()));
1208        }
1209        pos -= 1;
1210        r.seek(SeekFrom::Start(pos))?;
1211        let b = read_u8(r)?;
1212        if b == 0xFF {
1213            bits += 8;
1214            if bits > MAX_BITS {
1215                return Err(DarError::Corrupt("terminator padding too long".into()));
1216            }
1217        } else {
1218            break b;
1219        }
1220    };
1221    // The terminator byte must have its top bit set; count consecutive set MSBs.
1222    if terminal & 0x80 == 0 {
1223        return Err(DarError::Corrupt(format!(
1224            "invalid terminator byte {terminal:#04x}"
1225        )));
1226    }
1227    let mut x = terminal;
1228    while x != 0 {
1229        if x & 0x80 == 0 {
1230            return Err(DarError::Corrupt("malformed terminator bit run".into()));
1231        }
1232        bits += 1;
1233        x <<= 1;
1234    }
1235    let byte_offset = bits * BLOCK_SIZE;
1236    let infinint_start = pos
1237        .checked_sub(byte_offset)
1238        .ok_or_else(|| DarError::Corrupt("terminator offset underflows".into()))?;
1239    r.seek(SeekFrom::Start(infinint_start))?;
1240    read_infinint(r)
1241}
1242
1243/// Parse all catalog entries, returning file entries with their extraction info.
1244///
1245/// Stops when the root directory is closed (depth reaches zero) or an unknown
1246/// entry type is encountered (slice trailer).
1247fn parse_catalog<R: Read + Seek>(
1248    r: &mut R,
1249    format_major: u32,
1250    global_comp: u8,
1251) -> Result<(Vec<EntryRef>, bool), DarError> {
1252    let mut entries = Vec::new();
1253    let mut dir_stack: Vec<Vec<u8>> = Vec::new();
1254    let mut depth: u32 = 0;
1255    // True once the catalog is walked to its closing root EOD; left false if we
1256    // stop early (unknown entry type or a truncated stream).
1257    let mut complete = false;
1258
1259    loop {
1260        let mut buf = [0u8; 1];
1261        match r.read_exact(&mut buf) {
1262            Ok(()) => {}
1263            Err(_) => break,
1264        }
1265
1266        // Lower 5 bits of cat_sig + 0x60 gives the ASCII type letter.
1267        let entry_type = ((buf[0] & 0x1f) | 0x60) as char;
1268
1269        match entry_type {
1270            'z' => {
1271                // End of directory
1272                depth = depth.saturating_sub(1);
1273                dir_stack.pop();
1274                if depth == 0 {
1275                    complete = true; // reached the closing root EOD — clean end
1276                    break;
1277                }
1278            }
1279            'd' => {
1280                let name = read_nul_bytes(r)?;
1281                let inode = read_inode_base(r, format_major)?;
1282                if format_major >= 9 && (inode.flags >> 4) & 1 != 0 {
1283                    skip_fsa(r)?;
1284                }
1285                let is_root = depth == 0;
1286                depth += 1;
1287                // The archive root (`<ROOT>`, or `"root"` in formats 1/9) is a
1288                // virtual node: `<ROOT>` is dropped entirely; a named root becomes
1289                // the path prefix. Neither is listed as an entry. Real
1290                // sub-directories are listed with their full path.
1291                if name != b"<ROOT>" {
1292                    let path = join_path(&dir_stack, &name);
1293                    if !is_root {
1294                        entries.push(meta_entry(path, EntryKind::Directory, &inode, None));
1295                    }
1296                    dir_stack.push(name);
1297                }
1298            }
1299            'f' => {
1300                let name = read_nul_bytes(r)?;
1301                let inode = read_inode_base(r, format_major)?;
1302                if format_major >= 9 && (inode.flags >> 4) & 1 != 0 {
1303                    skip_fsa(r)?;
1304                }
1305
1306                let FileFields {
1307                    size,
1308                    archive_offset,
1309                    stored_size,
1310                    compression,
1311                    crc,
1312                } = read_file_fields(r, format_major, global_comp)?;
1313
1314                entries.push(EntryRef {
1315                    path: join_path(&dir_stack, &name),
1316                    kind: EntryKind::File,
1317                    size,
1318                    uid: inode.uid,
1319                    gid: inode.gid,
1320                    mode: inode.mode,
1321                    atime: inode.atime,
1322                    mtime: inode.mtime,
1323                    ctime: inode.ctime,
1324                    symlink_target: None,
1325                    archive_offset,
1326                    stored_size,
1327                    compression,
1328                    crc,
1329                });
1330            }
1331            'l' => {
1332                // Symbolic link: inode + NUL-terminated target path.
1333                let name = read_nul_bytes(r)?;
1334                let inode = read_inode_base(r, format_major)?;
1335                if format_major >= 9 && (inode.flags >> 4) & 1 != 0 {
1336                    skip_fsa(r)?;
1337                }
1338                let target = read_nul_bytes(r)?;
1339                let path = join_path(&dir_stack, &name);
1340                entries.push(meta_entry(path, EntryKind::Symlink, &inode, Some(target)));
1341            }
1342            'p' | 's' => {
1343                // Named pipe (FIFO) / unix socket: a bare inode, no data and no
1344                // type-specific fields.
1345                let name = read_nul_bytes(r)?;
1346                let inode = read_inode_base(r, format_major)?;
1347                if format_major >= 9 && (inode.flags >> 4) & 1 != 0 {
1348                    skip_fsa(r)?;
1349                }
1350                let kind = if entry_type == 'p' {
1351                    EntryKind::NamedPipe
1352                } else {
1353                    EntryKind::Socket
1354                };
1355                entries.push(meta_entry(join_path(&dir_stack, &name), kind, &inode, None));
1356            }
1357            _ => break, // unknown type = slice trailer or unhandled entry
1358        }
1359    }
1360
1361    Ok((entries, complete))
1362}
1363
/// The file-specific catalog fields that follow a file inode.
struct FileFields {
    /// First infinint of the file fields: the file's size.
    size: u64,
    /// Second infinint: where the file's data sits in the archive.
    archive_offset: u64,
    /// Stored (on-archive) size of the data; for formats <= 7 a recorded 0 is
    /// replaced by `size`.
    stored_size: u64,
    /// Codec byte for this entry: read from the entry in format 8+, otherwise
    /// the archive-global codec.
    compression: u8,
    /// Raw CRC bytes when the catalog carries one, `None` otherwise.
    crc: Option<Vec<u8>>,
}
1372
1373/// Read the file-specific catalog fields after the inode. Layout differs by
1374/// format (libdar cat_file.cpp / crc.cpp):
1375/// - 8+: storage_size · file_data_status(1) · comp(1) · length-prefixed CRC.
1376/// - 2-7: storage_size · fixed 2-byte CRC; no status/comp byte — the
1377///   archive-global codec applies.
1378/// - 1: size · offset only; storage_size synthesised, global codec applies.
1379fn read_file_fields<R: Read + Seek>(
1380    r: &mut R,
1381    format_major: u32,
1382    global_comp: u8,
1383) -> Result<FileFields, DarError> {
1384    let size = read_infinint(r)?;
1385    let archive_offset = read_infinint(r)?;
1386    let (mut stored_size, compression, crc) = if format_major >= 8 {
1387        let ss = read_infinint(r)?;
1388        let _file_data_status = read_u8(r)?;
1389        let comp = read_u8(r)?;
1390        let crc = read_crc(r)?; // infinint width + that many raw bytes
1391        (ss, comp, crc)
1392    } else if format_major >= 2 {
1393        let ss = read_infinint(r)?;
1394        let mut crcbuf = [0u8; 2]; // legacy: fixed 2-byte CRC, no width prefix
1395        r.read_exact(&mut crcbuf)?;
1396        (ss, global_comp, Some(crcbuf.to_vec()))
1397    } else {
1398        (size, global_comp, None) // format 1: storage_size synthesised, no CRC
1399    };
1400    // Pre-8: storage_size 0 means the data is stored uncompressed.
1401    if format_major <= 7 && stored_size == 0 {
1402        stored_size = size;
1403    }
1404    Ok(FileFields {
1405        size,
1406        archive_offset,
1407        stored_size,
1408        compression,
1409        crc,
1410    })
1411}
1412
1413/// Read a format-8+ length-prefixed CRC: an infinint width then that many raw
1414/// bytes. A zero width (abnormal — libdar uses >= 1) yields `None`; a width past
1415/// [`MAX_CRC_SIZE`] is rejected as corrupt (allocation-bomb guard).
1416fn read_crc<R: Read>(r: &mut R) -> Result<Option<Vec<u8>>, DarError> {
1417    let crc_size = read_infinint(r)?;
1418    if crc_size == 0 {
1419        return Ok(None);
1420    }
1421    if crc_size > MAX_CRC_SIZE {
1422        return Err(DarError::Corrupt(format!(
1423            "CRC width {crc_size} exceeds {MAX_CRC_SIZE}-byte bound"
1424        )));
1425    }
1426    let mut buf = vec![0u8; crc_size as usize];
1427    r.read_exact(&mut buf)?;
1428    Ok(Some(buf))
1429}
1430
/// libdar's per-file CRC: an XOR-fold of `data` into a `width`-byte accumulator,
/// byte `i` into slot `i mod width` (zero-init, read out slot 0 first; no final
/// transform).
///
/// A zero `width` denotes an absent CRC; it yields an empty vector instead of
/// panicking on the modulo.
fn dar_crc(data: &[u8], width: usize) -> Vec<u8> {
    let mut acc = vec![0u8; width];
    if width == 0 {
        // No slots to fold into (and `chunks(0)` would panic).
        return acc;
    }
    // Each `width`-sized chunk lines up with the accumulator slots, so byte `i`
    // lands in slot `i mod width` without a per-byte modulo; the final, shorter
    // chunk only touches the leading slots.
    for chunk in data.chunks(width) {
        for (slot, &byte) in acc.iter_mut().zip(chunk) {
            *slot ^= byte;
        }
    }
    acc
}
1441
/// Render `bytes` as lowercase hexadecimal, two digits per byte, high nibble
/// first. An empty slice yields an empty string.
fn to_hex(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    bytes
        .iter()
        // Split every byte into its (high, low) nibbles; both are < 16, so the
        // table lookup below can never index out of bounds.
        .flat_map(|&byte| [byte >> 4, byte & 0x0f])
        .map(|nibble| char::from(DIGITS[usize::from(nibble)]))
        .collect()
}
1454
/// Build a raw-byte path from the directory `stack` and a leaf `name`: every
/// stack component is emitted followed by `/`, then `name` is appended as-is.
/// No normalisation or UTF-8 validation is performed.
fn join_path(stack: &[Vec<u8>], name: &[u8]) -> Vec<u8> {
    let mut joined: Vec<u8> = stack
        .iter()
        .flat_map(|dir| dir.iter().copied().chain(std::iter::once(b'/')))
        .collect();
    joined.extend_from_slice(name);
    joined
}
1465
1466/// Build an `EntryRef` for a non-file inode (dir/symlink/pipe/socket): it carries
1467/// metadata but no archive data.
1468fn meta_entry(
1469    path: Vec<u8>,
1470    kind: EntryKind,
1471    inode: &Inode,
1472    symlink_target: Option<Vec<u8>>,
1473) -> EntryRef {
1474    EntryRef {
1475        path,
1476        kind,
1477        size: 0,
1478        uid: inode.uid,
1479        gid: inode.gid,
1480        mode: inode.mode,
1481        atime: inode.atime,
1482        mtime: inode.mtime,
1483        ctime: inode.ctime,
1484        symlink_target,
1485        archive_offset: 0,
1486        stored_size: 0,
1487        compression: b'n',
1488        crc: None,
1489    }
1490}
1491
1492// ── Low-level I/O helpers ─────────────────────────────────────────────────────
1493
1494/// Read a DAR variable-length infinint, decoded to `u64`.
1495///
1496/// Format (TG=4): optional leading `0x00` skip-bytes, then a terminal byte
1497/// with exactly one bit set; `pos = terminal.leading_zeros()` and the value
1498/// occupies `(skip_count * 8 + pos + 1) * 4` big-endian bytes.
1499///
1500/// A `u64` holds at most 8 data bytes.  Any encoding wider than that — i.e.
1501/// *any* leading `0x00` (which alone implies ≥ 36 bytes) or a terminal below
1502/// `0x40` (`pos > 1`) — cannot be represented and is rejected as `Corrupt`
1503/// rather than silently truncated.  This single bound also removes the
1504/// `(skip * 8 …)` arithmetic-overflow panic and caps the leading-zero scan, so
1505/// a malicious all-zero run can never spin or overflow the skip counter.
1506fn read_infinint<R: Read>(r: &mut R) -> Result<u64, DarError> {
1507    let terminal = read_u8(r)?;
1508    if terminal == 0x00 {
1509        // A skip-byte group is at least 36 data bytes — far beyond u64.
1510        return Err(DarError::Corrupt(
1511            "infinint exceeds 64-bit range (multi-group encoding)".into(),
1512        ));
1513    }
1514    if terminal.count_ones() != 1 {
1515        return Err(DarError::Corrupt(format!(
1516            "invalid infinint terminal: {terminal:#04x}"
1517        )));
1518    }
1519    let pos = terminal.leading_zeros(); // 0 ..= 7
1520    if pos > 1 {
1521        // data_bytes = (pos + 1) * 4 > 8 → does not fit in u64.
1522        return Err(DarError::Corrupt(format!(
1523            "infinint exceeds 64-bit range: terminal {terminal:#04x} implies {} bytes",
1524            (pos + 1) * 4
1525        )));
1526    }
1527    let data_bytes = (pos + 1) * 4; // 4 (terminal 0x80) or 8 (terminal 0x40)
1528    let mut val: u64 = 0;
1529    for _ in 0..data_bytes {
1530        val = (val << 8) | u64::from(read_u8(r)?);
1531    }
1532    Ok(val)
1533}
1534
1535fn read_u8<R: Read>(r: &mut R) -> Result<u8, DarError> {
1536    let mut b = [0u8; 1];
1537    r.read_exact(&mut b)?;
1538    Ok(b[0])
1539}
1540
/// Upper bound, in bytes, on a NUL-terminated path/name field (64 KiB).
///
/// Real DAR entries stay well under this; the cap stops a NUL-free region of a
/// hostile archive from growing the buffer until EOF (or OOM on a multi-GiB
/// stream).  Both `read_nul_bytes` and `skip_nul_string` enforce it, and the
/// terminating NUL itself does not count towards the limit.
const MAX_NUL_STRING: usize = 64 * 1024;
1545
1546/// Read a NUL-terminated byte string (raw, not UTF-8 validated), consuming the
1547/// NUL. Length-capped at `MAX_NUL_STRING` so a NUL-free hostile region can't grow
1548/// the buffer to EOF.
1549fn read_nul_bytes<R: Read>(r: &mut R) -> Result<Vec<u8>, DarError> {
1550    let mut bytes = Vec::new();
1551    loop {
1552        let b = read_u8(r)?;
1553        if b == 0 {
1554            break;
1555        }
1556        if bytes.len() >= MAX_NUL_STRING {
1557            return Err(DarError::Corrupt(format!(
1558                "NUL-terminated string exceeds {MAX_NUL_STRING} bytes"
1559            )));
1560        }
1561        bytes.push(b);
1562    }
1563    Ok(bytes)
1564}
1565
1566/// Skip a NUL-terminated string without collecting the bytes.
1567fn skip_nul_string<R: Read>(r: &mut R) -> Result<(), DarError> {
1568    let mut len: usize = 0;
1569    loop {
1570        if read_u8(r)? == 0 {
1571            return Ok(());
1572        }
1573        len += 1;
1574        if len > MAX_NUL_STRING {
1575            return Err(DarError::Corrupt(format!(
1576                "NUL-terminated string exceeds {MAX_NUL_STRING} bytes"
1577            )));
1578        }
1579    }
1580}
1581
1582/// Seek past `n` bytes.
1583fn skip<R: Seek>(r: &mut R, n: u64) -> Result<(), DarError> {
1584    if n > 0 {
1585        // `SeekFrom::Current` takes an i64; a value above i64::MAX would cast to
1586        // a negative offset and seek *backwards* (re-reading earlier bytes on a
1587        // File).  No real DAR field is that large — reject it outright.
1588        let off = i64::try_from(n)
1589            .map_err(|_| DarError::Corrupt(format!("skip length {n} exceeds seekable range")))?;
1590        r.seek(SeekFrom::Current(off)).map_err(DarError::Io)?;
1591    }
1592    Ok(())
1593}
1594
1595/// Skip one DAR timestamp field.
1596///
1597/// Timestamps are prefixed with a type byte:
1598/// - `'s'` (0x73) and others: seconds only — one infinint follows
1599/// - `'n'` (0x6e): nanosecond precision — two infinints follow (seconds + nanoseconds)
1600fn read_timestamp<R: Read + Seek>(r: &mut R, format_major: u32) -> Result<i64, DarError> {
1601    // Format 8 and earlier store a bare seconds infinint with NO precision byte
1602    // (libdar datetime.cpp:372). Format 9+ prefix a unit byte ('s' seconds,
1603    // 'u' microsecond, 'n' nanosecond); sub-second units add a second infinint,
1604    // which we read and discard (seconds resolution is what we expose).
1605    if format_major < 9 {
1606        return Ok(read_infinint(r)? as i64);
1607    }
1608    let ts_type = read_u8(r)?;
1609    let secs = read_infinint(r)? as i64;
1610    if ts_type == b'n' || ts_type == b'u' {
1611        read_infinint(r)?;
1612    }
1613    Ok(secs)
1614}
1615
1616/// Read a 2-byte big-endian `u16` (uid/gid for format <= 7, and permission bits).
1617fn read_u16<R: Read>(r: &mut R) -> Result<u16, DarError> {
1618    let mut b = [0u8; 2];
1619    r.read_exact(&mut b)?;
1620    Ok(u16::from_be_bytes(b))
1621}
1622
/// Decoded inode metadata shared by every catalog entry type.
///
/// Filled in by `read_inode_base`; which fields are actually present on disk
/// depends on the archive format version, as noted per field.
struct Inode {
    /// Inode flag byte as stored (format 2+); synthesised as 0 for format 1,
    /// which has no flag byte. Bit `0x10` is the FSA-full status.
    flags: u8,
    /// Owner id: a 2-byte `u16` widened to `u64` for format <= 7, an infinint
    /// for format 8+.
    uid: u64,
    /// Group id, stored the same way as `uid`.
    gid: u64,
    /// Permission bits, read as a 2-byte big-endian `u16`.
    mode: u16,
    /// First timestamp field, whole seconds as returned by `read_timestamp`.
    atime: i64,
    /// Second timestamp field, whole seconds as returned by `read_timestamp`.
    mtime: i64,
    /// Third timestamp (last_cha); only present from format 8, otherwise `None`.
    ctime: Option<i64>,
}
1633
1634/// Read one inode's base fields and return them. Layout in order: an optional
1635/// flags byte (format 2+), uid, gid, a `u16` perms field, atime, mtime, and a
1636/// ctime for format 8+. uid/gid are a 2-byte `u16` for format `<= 7` and an
1637/// infinint for 8+; each timestamp is decoded by [`read_timestamp`]. FSA inode
1638/// fields (format 9+, when flag bit `0x10` is set) are consumed and discarded.
1639fn read_inode_base<R: Read + Seek>(r: &mut R, format_major: u32) -> Result<Inode, DarError> {
1640    // Format 1 predates extended attributes and has NO leading flag byte
1641    // (libdar cat_inode.cpp); formats 2+ store it. Synthesise 0 for format 1.
1642    let flags = if format_major >= 2 { read_u8(r)? } else { 0 };
1643    // uid/gid: 2-byte u16 for format <= 7 (libdar cat_inode.cpp:171), infinint for 8+.
1644    let (uid, gid) = if format_major <= 7 {
1645        (u64::from(read_u16(r)?), u64::from(read_u16(r)?))
1646    } else {
1647        (read_infinint(r)?, read_infinint(r)?)
1648    };
1649    let mode = read_u16(r)?; // perms: a 2-byte big-endian u16, never an infinint
1650    let atime = read_timestamp(r, format_major)?;
1651    let mtime = read_timestamp(r, format_major)?;
1652    // ctime (last_cha) exists only from format 8 (libdar cat_inode.cpp:197).
1653    let ctime = if format_major >= 8 {
1654        Some(read_timestamp(r, format_major)?)
1655    } else {
1656        None
1657    };
1658    // FSA inode fields exist only from format 9 (libdar cat_inode.cpp:264); bit
1659    // 0x10 is the FSA-full status. Formats <= 8 have no FSA.
1660    if format_major >= 9 && (flags >> 4) & 1 != 0 {
1661        read_infinint(r)?;
1662        read_infinint(r)?;
1663    }
1664    Ok(Inode {
1665        flags,
1666        uid,
1667        gid,
1668        mode,
1669        atime,
1670        mtime,
1671        ctime,
1672    })
1673}
1674
1675/// Skip one FSA (filesystem attributes) block.
1676///
1677/// Format: infinint(family_tag) + infinint(data_size) + data_size bytes.
1678fn skip_fsa<R: Read + Seek>(r: &mut R) -> Result<(), DarError> {
1679    let _tag = read_infinint(r)?;
1680    let size = read_infinint(r)?;
1681    skip(r, size)
1682}
1683
1684// ── Unit tests ────────────────────────────────────────────────────────────────
1685
#[cfg(test)]
mod tests {
    // Unit tests for the low-level readers and locators in this file. Inputs
    // are hand-built byte sequences fed through `Cursor`; most tests append a
    // sentinel or check the cursor position to prove exactly how many bytes a
    // reader consumed.
    use super::*;
    use std::io::Cursor;

    // ── SliceReader truncated-slice guard ─────────────────────────────────────

    #[test]
    fn slicereader_stops_on_truncated_slice() {
        use std::io::Read;
        // A span claiming more bytes than its file holds (only constructible
        // internally — `open` always measures the real file). Reading must stop at
        // the real EOF instead of spinning on the missing tail.
        // The process id keeps the temp-file name unique per test process.
        let path = std::env::temp_dir().join(format!("dar_ms_trunc_{}.bin", std::process::id()));
        std::fs::write(&path, [1u8, 2, 3, 4]).unwrap();
        let mut sr = SliceReader {
            slices: vec![SliceSpan {
                file: File::open(&path).unwrap(),
                file_data_start: 0,
                logical_start: 0,
                logical_len: 100, // lies: only 4 bytes exist
            }],
            pos: 0,
            total: 100,
        };
        let mut buf = [0u8; 50];
        assert_eq!(sr.read(&mut buf).unwrap(), 4);
        assert_eq!(&buf[..4], &[1, 2, 3, 4]);
        // Best-effort cleanup; a failure to delete must not fail the test.
        let _ = std::fs::remove_file(&path);
    }

    // ── read_infinint ─────────────────────────────────────────────────────────

    #[test]
    fn infinint_decodes_value() {
        // 0x80 terminal + 4 big-endian data bytes encoding 13.
        let data = [0x80u8, 0x00, 0x00, 0x00, 0x0d];
        assert_eq!(read_infinint(&mut Cursor::new(&data[..])).unwrap(), 13);
    }

    #[test]
    fn infinint_bad_preamble_returns_corrupt() {
        // 0x03 = two bits set — not a valid infinint terminal.
        let data = [0x03u8, 0x00, 0x00, 0x00, 0x00];
        let err = read_infinint(&mut Cursor::new(&data[..])).unwrap_err();
        assert!(matches!(&err, DarError::Corrupt(_)));
    }

    #[test]
    fn infinint_truncated_returns_io() {
        // Only 2 bytes — the 0x80 terminal promises 4 data bytes but just one
        // follows, so the read hits EOF and surfaces as an I/O error.
        let err = read_infinint(&mut Cursor::new(&[0x80u8, 0x00][..])).unwrap_err();
        assert!(matches!(err, DarError::Io(_)));
    }

    #[test]
    fn infinint_0x40_preamble_reads_8_data_bytes() {
        // 0x40 terminal: leading_zeros=1, pos=1, data_bytes=(0*8+1+1)*4=8
        // Encodes the value 0x5d15_9331 in 8 big-endian bytes.
        let mut data = vec![0x40u8];
        data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00, 0x5d, 0x15, 0x93, 0x31]);
        assert_eq!(
            read_infinint(&mut Cursor::new(data)).unwrap(),
            0x5d15_9331u64
        );
    }

    #[test]
    fn infinint_multi_bit_terminal_returns_corrupt() {
        // 0x60 = 0110_0000 — two bits set, not a valid terminal.
        let data = [0x60u8, 0x00, 0x00, 0x00, 0x00];
        let err = read_infinint(&mut Cursor::new(&data[..])).unwrap_err();
        assert!(matches!(&err, DarError::Corrupt(_)));
    }

    // ── read_u8 ───────────────────────────────────────────────────────────────

    #[test]
    fn read_u8_reads_single_byte() {
        assert_eq!(read_u8(&mut Cursor::new(&[0x42u8][..])).unwrap(), 0x42);
    }

    #[test]
    fn read_u8_eof_returns_io() {
        // Empty input: there is no byte to read.
        let err = read_u8(&mut Cursor::new(&[][..])).unwrap_err();
        assert!(matches!(err, DarError::Io(_)));
    }

    // ── read_nul_bytes ──────────────────────────────────────────────────────

    #[test]
    fn nul_bytes_reads_until_nul() {
        // Everything after the first NUL ("world") must be left unread.
        let data = b"hello\x00world";
        assert_eq!(
            read_nul_bytes(&mut Cursor::new(&data[..])).unwrap(),
            b"hello"
        );
    }

    #[test]
    fn nul_bytes_preserves_non_utf8() {
        // Raw bytes are kept verbatim — a non-UTF-8 name must NOT be rejected.
        let data = [0xFF, 0x80, 0x00];
        assert_eq!(
            read_nul_bytes(&mut Cursor::new(&data[..])).unwrap(),
            vec![0xFF, 0x80]
        );
    }

    #[test]
    fn nul_bytes_eof_before_nul_returns_io() {
        // Short input with no terminator: EOF is reached before the length cap.
        let err = read_nul_bytes(&mut Cursor::new(b"no-nul".to_vec())).unwrap_err();
        assert!(matches!(err, DarError::Io(_)));
    }

    // ── skip_nul_string ───────────────────────────────────────────────────────

    #[test]
    fn skip_nul_string_advances_past_nul() {
        let data = b"skip\x00rest";
        let mut c = Cursor::new(data.to_vec());
        skip_nul_string(&mut c).unwrap();
        assert_eq!(c.position(), 5); // "skip\0" = 5 bytes consumed
    }

    #[test]
    fn skip_nul_string_eof_returns_io() {
        // Short input with no terminator: EOF is reached before the length cap.
        let err = skip_nul_string(&mut Cursor::new(b"no-nul".to_vec())).unwrap_err();
        assert!(matches!(err, DarError::Io(_)));
    }

    // ── find_catalogue ────────────────────────────────────────────────────────

    #[test]
    fn find_catalogue_body_too_short() {
        // Fewer than 6 bytes — can't fill the initial window; label also too short.
        // Either error message is accepted here.
        let label = [0u8; 10];
        let err = find_catalogue(&mut Cursor::new(&[0x01u8, 0x02, 0x03][..]), &label).unwrap_err();
        assert!(
            matches!(&err, DarError::Corrupt(s) if s == "archive body too short"
            || s == "seqt_catalogue not found")
        );
    }

    #[test]
    fn find_catalogue_escape_at_start() {
        // The six escape bytes open the stream, followed by one trailing byte;
        // the cursor must end right after the escape.
        let mut data = [0xAD, 0xFD, 0xEA, 0x77, 0x21, 0x43, 0xFF];
        let mut c = Cursor::new(&mut data[..]);
        let via_escape = find_catalogue(&mut c, &[0u8; 10]).unwrap();
        assert!(via_escape);
        assert_eq!(c.position(), 6);
    }

    #[test]
    fn find_catalogue_escape_not_found() {
        // 10 bytes of zeros, label is 0xFF×10 so label scan also fails.
        let label = [0xFFu8; 10];
        let err = find_catalogue(&mut Cursor::new(&[0u8; 10][..]), &label).unwrap_err();
        assert!(matches!(&err, DarError::Corrupt(s) if s == "seqt_catalogue not found"));
    }

    #[test]
    fn find_catalogue_label_fallback() {
        let label: [u8; 10] = [0xA1, 0xB2, 0xC3, 0xD4, 0xE5, 0xF6, 0x07, 0x18, 0x29, 0x3A];
        // Prefix junk (no escape) followed by the label bytes.
        let mut data = vec![0x00u8; 5];
        data.extend_from_slice(&label);
        let mut c = Cursor::new(data);
        let via_escape = find_catalogue(&mut c, &label).unwrap();
        assert!(!via_escape);
        assert_eq!(c.position(), 15); // 5 junk + 10 label consumed
    }

    // ── skip ──────────────────────────────────────────────────────────────────

    #[test]
    fn skip_zero_does_not_move_cursor() {
        let mut c = Cursor::new(vec![0xFFu8; 10]);
        skip(&mut c, 0).unwrap();
        assert_eq!(c.position(), 0);
    }

    #[test]
    fn skip_n_advances_cursor() {
        let mut c = Cursor::new(vec![0xFFu8; 10]);
        skip(&mut c, 7).unwrap();
        assert_eq!(c.position(), 7);
    }

    // ── read_inode_base ───────────────────────────────────────────────────────

    #[test]
    fn inode_base_bit4_clear_reads_31_bytes() {
        // flags(1) + uid(5) + gid(5) + perms(2) + 3×[type(1)+secs(5)] = 31 bytes
        let mut data = vec![0x00u8]; // flags (bit4=0)
        data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // uid
        data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // gid
        data.extend_from_slice(&[0x00, 0x00]); // perms
        for _ in 0..3 {
            data.push(b's'); // timestamp type
            data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // seconds
        }
        data.push(0xFF); // sentinel — must not be consumed
        let mut c = Cursor::new(data);
        assert_eq!(read_inode_base(&mut c, 11).unwrap().flags, 0x00);
        assert_eq!(c.position(), 31);
    }

    #[test]
    fn inode_base_bit4_set_reads_41_bytes() {
        // flags(1) + uid(5) + gid(5) + perms(2) + 3×[type(1)+secs(5)] + nlink(5) + field9(5) = 41
        let mut data = vec![0x10u8]; // flags (bit4=1)
        data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // uid
        data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // gid
        data.extend_from_slice(&[0x00, 0x00]); // perms
        for _ in 0..3 {
            data.push(b's');
            data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]);
        }
        data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // nlink
        data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x00]); // field9
        data.push(0xFF); // sentinel
        let mut c = Cursor::new(data);
        assert_eq!(read_inode_base(&mut c, 11).unwrap().flags, 0x10);
        assert_eq!(c.position(), 41);
    }

    // ── skip_fsa ─────────────────────────────────────────────────────────────

    #[test]
    fn skip_fsa_consumes_tag_size_and_data() {
        // tag=infinint(5) + size=infinint(3) + 3 data bytes
        let mut data = Vec::new();
        data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x05]); // tag
        data.extend_from_slice(&[0x80, 0x00, 0x00, 0x00, 0x03]); // size=3
        data.extend_from_slice(&[0xAA, 0xBB, 0xCC]); // data
        data.push(0xFF); // sentinel
        let mut c = Cursor::new(data);
        skip_fsa(&mut c).unwrap();
        assert_eq!(c.position(), 13); // 5 + 5 + 3 = 13
    }

    // ── hardening: malicious / corrupted infinint encodings ───────────────────
    //
    // A `u64` holds at most 8 data bytes.  The reader's contract is "decode to
    // u64 or return Corrupt" — it must never silently truncate an over-wide
    // value, overflow while computing the byte count, or loop on a zero run.

    #[test]
    fn infinint_leading_zero_byte_returns_corrupt() {
        // A leading 0x00 skip-byte implies a ≥36-byte group — far beyond u64.
        // Must be rejected as Corrupt, not mislabelled as an I/O shortage.
        let data = [0x00u8, 0x80, 0x00, 0x00, 0x00, 0x00];
        let err = read_infinint(&mut Cursor::new(&data[..])).unwrap_err();
        assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
    }

    #[test]
    fn infinint_12_byte_group_exceeds_u64_returns_corrupt() {
        // 0x20 terminal → pos=2 → 12 data bytes → cannot fit in u64.
        // Must error rather than silently truncate to a wrong value.
        let mut data = vec![0x20u8];
        data.extend_from_slice(&[0x11; 12]);
        let err = read_infinint(&mut Cursor::new(data)).unwrap_err();
        assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
    }

    #[test]
    fn infinint_all_zero_run_returns_corrupt_without_hanging() {
        // A run of zero bytes must terminate promptly with Corrupt, never spin
        // consuming the whole stream (and never overflow-panic the skip count).
        let data = vec![0u8; 4096];
        let err = read_infinint(&mut Cursor::new(data)).unwrap_err();
        assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
    }

    // ── hardening: unbounded NUL-terminated strings ───────────────────────────

    #[test]
    fn nul_bytes_without_terminator_is_length_bounded() {
        // No NUL in 200 KiB of data: must be rejected once the path cap is hit,
        // not grow the buffer until EOF (or OOM on a multi-GiB stream).
        let data = vec![b'A'; 200_000];
        let err = read_nul_bytes(&mut Cursor::new(data)).unwrap_err();
        assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
    }

    #[test]
    fn skip_nul_string_without_terminator_is_length_bounded() {
        // Same NUL-free input as above: the skip path must enforce the cap too.
        let data = vec![b'A'; 200_000];
        let err = skip_nul_string(&mut Cursor::new(data)).unwrap_err();
        assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
    }

    // ── hardening: skip must never seek backwards ─────────────────────────────

    #[test]
    fn skip_value_above_i64_max_returns_corrupt() {
        // n > i64::MAX casts to a negative i64 → SeekFrom::Current would seek
        // *backwards* on a File (re-reading earlier bytes).  Must be rejected,
        // and the stream position must not move.
        let mut c = Cursor::new(vec![0u8; 64]);
        c.set_position(32);
        let err = skip(&mut c, 0x8000_0000_0000_0000).unwrap_err();
        assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
        assert_eq!(c.position(), 32); // unchanged on a rejected skip
    }

    // ── terminateur trailer (pre-8 catalog locator) ───────────────────────────

    #[test]
    fn terminateur_reads_catalogue_offset() {
        // pos infinint 0x18 = 24; terminator 0xc0 → two leading ones → 2*4 = 8
        // bytes back to the infinint.
        let data = vec![0x80u8, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0xc0];
        assert_eq!(read_terminateur(&mut Cursor::new(data)).unwrap(), 24);
    }

    #[test]
    fn terminateur_all_ff_underflows_returns_corrupt() {
        // Four 0xFF bytes and nothing else: must be reported as Corrupt.
        let err = read_terminateur(&mut Cursor::new(vec![0xFFu8; 4])).unwrap_err();
        assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
    }

    #[test]
    fn terminateur_excessive_ff_padding_returns_corrupt() {
        // 600 bytes of 0xFF: must be reported as Corrupt.
        let err = read_terminateur(&mut Cursor::new(vec![0xFFu8; 600])).unwrap_err();
        assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
    }

    #[test]
    fn terminateur_low_terminator_byte_returns_corrupt() {
        // Terminator byte 0x01 has no top bit set.
        let data = vec![0x80u8, 0x00, 0x00, 0x00, 0x18, 0x01];
        let err = read_terminateur(&mut Cursor::new(data)).unwrap_err();
        assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
    }

    #[test]
    fn terminateur_noncontiguous_high_bits_returns_corrupt() {
        // 0xA0 = 1010_0000: top bit set but the high-bit run is not contiguous.
        let data = vec![0x80u8, 0x00, 0x00, 0x00, 0x18, 0xA0];
        let err = read_terminateur(&mut Cursor::new(data)).unwrap_err();
        assert!(matches!(err, DarError::Corrupt(_)), "got {err:?}");
    }

    // ── find_catalogue: full-scan fallback + body-too-short ────────────────────

    #[test]
    fn find_catalogue_falls_back_to_full_scan() {
        // Escape near the start; a tiny tail window misses it, forcing the
        // archive_origin full-scan fallback.
        let mut data = vec![0x11u8, 0x22]; // junk before the escape
        data.extend_from_slice(&SEQT_CATALOGUE);
        data.extend_from_slice(&[0x33u8; 12]); // trailing bytes beyond the tail window
        let mut c = Cursor::new(data);
        let via_escape = find_catalogue_within(&mut c, &[0u8; 10], 4).unwrap();
        assert!(via_escape);
        assert_eq!(c.position(), 2 + SEQT_CATALOGUE.len() as u64);
    }

    #[test]
    fn find_catalogue_full_scan_miss_returns_not_found() {
        // No escape and no matching label anywhere; a tiny tail window forces
        // the full-scan fallback, which also misses → "not found".
        let mut c = Cursor::new(vec![0x11u8; 16]);
        let err = find_catalogue_within(&mut c, &[0xABu8; 10], 4).unwrap_err();
        assert!(matches!(&err, DarError::Corrupt(s) if s == "seqt_catalogue not found"));
    }

    #[test]
    fn find_catalogue_body_too_short_when_origin_at_eof() {
        // The cursor already sits at the end of the 6-byte buffer, so nothing
        // is left to scan.
        let mut c = Cursor::new(vec![0u8; 6]);
        c.seek(SeekFrom::Start(6)).unwrap();
        let err = find_catalogue(&mut c, &[0u8; 10]).unwrap_err();
        assert!(matches!(&err, DarError::Corrupt(s) if s == "archive body too short"));
    }

    // ── decode_stream / CapWriter ────────────────────────────────────────────

    #[test]
    fn decode_stream_caps_decompression_bomb() {
        use flate2::{write::ZlibEncoder, Compression};
        use std::io::Write;
        let mut enc = ZlibEncoder::new(Vec::new(), Compression::default());
        enc.write_all(&[0u8; 4096]).unwrap();
        let blob = enc.finish().unwrap();
        // Inflates to 4096 bytes but the CapWriter caps output at 16.
        let mut sink = Vec::new();
        let mut cap = CapWriter {
            inner: &mut sink,
            written: 0,
            max: 16,
        };
        let err = decode_stream(&blob[..], b'z', &mut cap).unwrap_err();
        assert!(matches!(&err, DarError::Corrupt(s) if s.contains("exceeds bound")));
    }

    // Each malformed-input test below feeds plain text to one codec byte and
    // expects a Corrupt error naming that codec.

    #[test]
    fn decode_stream_rejects_malformed_zlib() {
        let err = decode_stream(
            b"not a zlib stream at all".as_slice(),
            b'z',
            &mut Vec::new(),
        )
        .unwrap_err();
        assert!(matches!(&err, DarError::Corrupt(s) if s.contains("zlib decode failed")));
    }

    #[test]
    fn decode_stream_rejects_malformed_bzip2() {
        let err =
            decode_stream(b"not a bzip2 stream".as_slice(), b'y', &mut Vec::new()).unwrap_err();
        assert!(matches!(&err, DarError::Corrupt(s) if s.contains("bzip2 decode failed")));
    }

    #[test]
    fn decode_stream_rejects_malformed_xz() {
        let err = decode_stream(
            b"this is not an xz stream".as_slice(),
            b'x',
            &mut Vec::new(),
        )
        .unwrap_err();
        assert!(matches!(&err, DarError::Corrupt(s) if s.contains("xz decode failed")));
    }

    #[test]
    fn decode_stream_rejects_malformed_zstd() {
        let err = decode_stream(b"not a zstd frame".as_slice(), b'd', &mut Vec::new()).unwrap_err();
        assert!(matches!(&err, DarError::Corrupt(s) if s.contains("zstd decode failed")));
    }

    #[test]
    fn decode_stream_rejects_unknown_codec() {
        // No streamed codec routes here in a full build; a stray byte must error.
        let err = decode_stream(b"data".as_slice(), b'?', &mut Vec::new()).unwrap_err();
        assert!(
            matches!(&err, DarError::Corrupt(s) if s.contains("unrecognised compression codec"))
        );
    }

    #[test]
    fn header_flags_single_two_byte_and_overlong() {
        // Single byte (low bit clear): value is `byte & 0xFE`.
        assert_eq!(read_header_flags(&mut [0x10u8].as_slice()).unwrap(), 0x10);
        // Two bytes (first low bit set = continuation): 0x09,0x08 -> 0x0808.
        assert_eq!(
            read_header_flags(&mut [0x09u8, 0x08].as_slice()).unwrap(),
            0x0808
        );
        // A field that never terminates within 8 bytes is rejected.
        let err = read_header_flags(&mut [0xFFu8; 9].as_slice()).unwrap_err();
        assert!(matches!(&err, DarError::Corrupt(s) if s.contains("flag field too large")));
    }

    #[test]
    fn compr_bs_edition_one_is_zero() {
        // Edition < 2 has no flag field, hence no block size.
        assert_eq!(read_compr_bs(&mut b"cmdline\x00rest".as_slice(), 1), 0);
    }

    #[test]
    fn compr_bs_read_after_initial_offset() {
        // cmd_line "\0" | flags 0x0808 (HAS_COMPRESS_BS + INITIAL_OFFSET) |
        // initial_offset (skipped) | compr_bs = 42.
        let mut buf = vec![0x00u8]; // empty command line
        buf.extend_from_slice(&[0x09, 0x08]); // flags = 0x0808
        buf.extend_from_slice(&[0x80, 0, 0, 0, 0]); // initial_offset = 0
        buf.extend_from_slice(&[0x80, 0, 0, 0, 42]); // compr_bs = 42
        assert_eq!(read_compr_bs(&mut buf.as_slice(), 11), 42);
    }

    #[test]
    fn cap_writer_forwards_within_bound_and_fails_over() {
        use std::io::Write;
        let mut sink = Vec::new();
        let mut w = CapWriter {
            inner: &mut sink,
            written: 0,
            max: 4,
        };
        assert_eq!(w.write(b"ab").unwrap(), 2); // within bound
        w.flush().unwrap();
        let err = w.write(b"cde").unwrap_err(); // 2 + 3 > 4
        assert_eq!(err.to_string(), "decompressed data exceeds bound");
        // The rejected write must not have reached the inner sink.
        assert_eq!(sink, b"ab");
    }
}
dar/lib.rs

dar/
lib.rs