dwarfs/
section.rs

1//! The low-level module for accessing sections in a DwarFS archive.
2//!
3//! A DwarFS archive consists of several sections. Sections for storing raw file
4//! data are also called blocks. Each section consists of a [`Header`] and
5//! maybe-compressed section payload bytes. The maximum uncompressed payload
6//! length of each data section is the block-size, which is by default 16MiB.
7//! Other non-block sections may be smaller or larger, but should still be
8//! small enough to store in memory.
9//!
10//! See [`SectionReader`] for APIs to read sections. In general, functions
11//! returning section headers will always validate the DwarFS version marked in
12//! the header is supported, and functions returning section payloads will
13//! always validate the fast XXH3 checksum against the header before return.
14//!
15//! See also:
16//! [DwarFS File System Format v2.5](https://github.com/mhx/dwarfs/blob/66b80efd0f47209c2d85c95c8af9f078436b6554/doc/dwarfs-format.md)
17use std::{fmt, mem::offset_of};
18
19use positioned_io::ReadAt;
20use xxhash_rust::xxh3::Xxh3Default;
21use zerocopy::{FromBytes, FromZeros, Immutable, IntoBytes, KnownLayout, little_endian as le};
22
23use crate::SUPPORTED_VERSION_RANGE;
24
/// Module-local alias: all fallible APIs here fail with [`Error`].
type Result<T> = std::result::Result<T, Error>;
26
/// An error raised from reading, validating, or decompressing sections.
///
/// The payload is boxed so the `Err` variant of [`Result`] stays
/// pointer-sized on the happy path.
pub struct Error(Box<ErrorInner>);
29
30impl fmt::Debug for Error {
31    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
32        self.0.fmt(f)
33    }
34}
35
#[derive(Debug)]
// Variants may never be constructed without relevant features enabled.
#[cfg_attr(not(feature = "default"), allow(dead_code))]
enum ErrorInner {
    // Header.
    /// Leading magic bytes are not `b"DWARFS"`.
    InvalidMagic([u8; 6]),
    /// Header version `(major, minor)` outside [`SUPPORTED_VERSION_RANGE`].
    UnsupportedVersion(u8, u8),
    /// Provided payload length disagrees with `Header::payload_size`.
    LengthMismatch,
    /// A fast (XXH3-64) or slow (SHA-512/256) digest mismatch.
    ChecksumMismatch,
    /// An offset computation exceeded the `u64` range (or underflowed).
    OffsetOverflow,

    // Payload.
    /// The compression algorithm is unknown, or its feature is disabled.
    UnsupportedCompressAlgo(CompressAlgo),
    /// The section type differs from what the caller expected.
    TypeMismatch {
        expect: SectionType,
        got: SectionType,
    },
    /// Payload size exceeds the caller-provided limit.
    /// `got` is `None` when the exact size is not known up front.
    PayloadTooLong {
        limit: usize,
        got: Option<u64>,
    },
    /// The decompressor reported a failure.
    Decompress(std::io::Error),
    /// The section index failed structural validation.
    MalformedSectionIndex(String),

    // Other.
    /// An underlying I/O failure.
    Io(std::io::Error),
}
63
64impl fmt::Display for Error {
65    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
66        match &*self.0 {
67            ErrorInner::InvalidMagic(magic) => {
68                write!(f, "invalid section magic: b\"{}\"", magic.escape_ascii())
69            }
70            ErrorInner::UnsupportedVersion(maj, min) => {
71                write!(f, "unsupported section version: DWARFS{maj}.{min}")
72            }
73            ErrorInner::LengthMismatch => f.pad("section payload length mismatch"),
74            ErrorInner::ChecksumMismatch => f.pad("section checksum mismatch"),
75            ErrorInner::OffsetOverflow => f.pad("section offset overflow"),
76
77            ErrorInner::UnsupportedCompressAlgo(algo) => {
78                write!(f, "unsupported section compress algorithm {algo:?}")
79            }
80            ErrorInner::TypeMismatch { expect, got } => {
81                write!(
82                    f,
83                    "section type mismatch, expect {expect:?} but got {got:?}"
84                )
85            }
86            ErrorInner::PayloadTooLong {
87                limit,
88                got: Some(got),
89            } => {
90                write!(
91                    f,
92                    "section payload has {got} bytes, exceeding the limit of {limit} bytes"
93                )
94            }
95            ErrorInner::PayloadTooLong { limit, got: None } => {
96                write!(f, "section payload exceeds the limit of {limit} bytes")
97            }
98            ErrorInner::MalformedSectionIndex(msg) => {
99                write!(f, "malformed section index: {msg}")
100            }
101
102            ErrorInner::Decompress(err) => write!(f, "failed to decompress section payload: {err}"),
103
104            ErrorInner::Io(err) => err.fmt(f),
105        }
106    }
107}
108
109impl std::error::Error for Error {
110    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
111        match &*self.0 {
112            ErrorInner::Decompress(err) | ErrorInner::Io(err) => Some(err),
113            _ => None,
114        }
115    }
116}
117
118impl From<std::io::Error> for Error {
119    #[cold]
120    fn from(err: std::io::Error) -> Self {
121        Self(Box::new(ErrorInner::Io(err)))
122    }
123}
124
125impl From<ErrorInner> for Error {
126    #[cold]
127    fn from(err: ErrorInner) -> Self {
128        Self(Box::new(err))
129    }
130}
131
/// The on-disk size of a serialized [`Header`] in bytes (64: the struct is
/// `repr(C)` with fields summing to 64 and no padding).
pub(crate) const HEADER_SIZE: u64 = size_of::<Header>() as u64;
133
/// The section (aka. block) header.
///
/// The layout mirrors the on-disk format exactly (`repr(C)`, little-endian
/// integers), so headers can be read/written directly via the zerocopy traits.
#[derive(Clone, Copy, PartialEq, Eq, Hash, FromBytes, IntoBytes, Immutable, KnownLayout)]
#[repr(C, align(8))]
pub struct Header {
    /// Header magic and format version.
    pub magic_version: MagicVersion,
    /// The "slow" hash digests of SHA-512/256.
    /// Covers the header starting at `fast_hash`, plus the payload.
    pub slow_hash: [u8; 32],
    /// The "fast" hash digests of XXH3-64.
    /// Covers the header starting at `section_number`, plus the payload.
    pub fast_hash: [u8; 8],
    /// The 0-based index of this section in the DwarFS archive.
    pub section_number: le::U32,
    /// The type of this section.
    pub section_type: SectionType,
    /// The compression algorithm of the section payload.
    pub compress_algo: CompressAlgo,
    /// The length in bytes of the compressed payload following.
    pub payload_size: le::U64,
}
153
154impl fmt::Debug for Header {
155    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
156        f.debug_struct("BlockHeader")
157            .field("magic_version", &self.magic_version)
158            .field("slow_hash", &format_args!("{:02x?}", self.slow_hash))
159            .field("slow_hash", &format_args!("{:02x?}", self.fast_hash))
160            .field("section_number", &self.section_number.get())
161            .field("section_type", &self.section_type)
162            .field("compress_algo", &self.compress_algo)
163            .field("payload_size", &self.payload_size.get())
164            .finish()
165    }
166}
167
168impl Header {
169    /// Calculate section checksum of header and payload using the "fast" XXH3-64 hash.
170    ///
171    /// Note: this will also hash part of the header.
172    ///
173    /// # Errors
174    ///
175    /// Returns `Err` if the length of `payload` disagree with the header.
176    pub fn calculate_fast_checksum(&self, payload: &[u8]) -> Result<[u8; 8]> {
177        if payload.len() as u64 != self.payload_size.get() {
178            bail!(ErrorInner::LengthMismatch);
179        }
180        let mut h = Xxh3Default::new();
181        h.update(&self.as_bytes()[offset_of!(Self, section_number)..]);
182        h.update(payload);
183        Ok(h.digest().to_le_bytes())
184    }
185
186    /// Validate section checksum of header and payload using the "fast" XXH3-64 hash.
187    ///
188    /// # Errors
189    ///
190    /// Returns `Err` if the length of `payload` disagree with the header, or the
191    /// checksum mismatches.
192    pub fn validate_fast_checksum(&self, payload: &[u8]) -> Result<()> {
193        let h = self.calculate_fast_checksum(payload)?;
194        if h != self.fast_hash {
195            bail!(ErrorInner::ChecksumMismatch);
196        }
197        Ok(())
198    }
199
200    /// Calculate section checksum of header and payload using the "slow" SHA2-512/256 hash.
201    ///
202    /// Note: this will also hash part of the header, including `fast_hash` field.
203    /// If you are filling checksums in the header, you must fill `fast_hash`
204    /// before calculating `slow_hash`.
205    ///
206    /// # Errors
207    ///
208    /// Returns `Err` if the length of `payload` disagree with the header.
209    pub fn calculate_slow_checksum(&self, payload: &[u8]) -> Result<[u8; 32]> {
210        use sha2::Digest;
211
212        if payload.len() as u64 != self.payload_size.get() {
213            bail!(ErrorInner::LengthMismatch);
214        }
215        let mut h = sha2::Sha512_256::new();
216        h.update(&self.as_bytes()[offset_of!(Self, fast_hash)..]);
217        h.update(payload);
218        Ok(*h.finalize().as_ref())
219    }
220
221    /// Validate section checksum of header and payload using the "slow" SHA2-512/256 hash.
222    ///
223    /// # Errors
224    ///
225    /// Returns `Err` if the length of `payload` disagree with the header, or the
226    /// checksum mismatches.
227    pub fn validate_slow_checksum(&self, payload: &[u8]) -> Result<()> {
228        let h = self.calculate_slow_checksum(payload)?;
229        if h != self.slow_hash {
230            bail!(ErrorInner::ChecksumMismatch);
231        }
232        Ok(())
233    }
234
235    /// Update `payload_size`, `fast_hash` and `slow_hash` in header for the specific `payload`.
236    ///
237    /// The `payload` should be raw data after compression, if any is used.
238    pub fn update_size_and_checksum(&mut self, payload: &[u8]) {
239        self.payload_size = u64::try_from(payload.len())
240            .expect("payload length overflows u64")
241            .into();
242        self.fast_hash = self
243            .calculate_fast_checksum(payload)
244            .expect("length matches");
245        self.slow_hash = self
246            .calculate_slow_checksum(payload)
247            .expect("length matches");
248    }
249
250    /// Check if this section header has the expected section type.
251    pub(crate) fn check_type(&self, expect: SectionType) -> Result<()> {
252        if self.section_type != expect {
253            bail!(ErrorInner::TypeMismatch {
254                expect,
255                got: self.section_type,
256            });
257        }
258        Ok(())
259    }
260
261    fn payload_size_limited(&self, limit: usize) -> Result<usize> {
262        let size = self.payload_size.get();
263        if let Some(size) = usize::try_from(size).ok().filter(|&n| n <= limit) {
264            Ok(size)
265        } else {
266            bail!(ErrorInner::PayloadTooLong {
267                limit,
268                got: Some(size)
269            })
270        }
271    }
272}
273
/// Section magic and format version.
///
/// This is the first 8 bytes of every [`Header`], stored verbatim on disk.
#[derive(Clone, Copy, PartialEq, Eq, Hash, FromBytes, IntoBytes, Immutable, KnownLayout)]
#[repr(C)]
pub struct MagicVersion {
    /// The section magic that should match `DWARFS` ([`MagicVersion::MAGIC`]).
    pub magic: [u8; 6],
    /// The format major version.
    pub major: u8,
    /// The format minor version.
    pub minor: u8,
}
285
286impl fmt::Debug for MagicVersion {
287    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
288        f.debug_struct("MagicVersion")
289            .field("magic", &format_args!("b\"{}\"", self.magic.escape_ascii()))
290            .field("major", &self.major)
291            .field("minor", &self.minor)
292            .finish()
293    }
294}
295
296impl MagicVersion {
297    /// The expected magic.
298    pub const MAGIC: [u8; 6] = *b"DWARFS";
299
300    /// The magic and latest supported version.
301    pub const LATEST: Self = Self {
302        magic: Self::MAGIC,
303        major: SUPPORTED_VERSION_RANGE.end().0,
304        minor: SUPPORTED_VERSION_RANGE.end().1,
305    };
306
307    /// Validate if the magic and version is supported by this library.
308    ///
309    /// # Errors
310    ///
311    /// Returns `Err` if the magic does not match [`MAGIC`](Self::MAGIC), or the
312    /// specified DwarFS version is outside [`SUPPORTED_VERSION_RANGE`].
313    pub fn validate(self) -> Result<()> {
314        let ver = (self.major, self.minor);
315        if self.magic != Self::MAGIC {
316            bail!(ErrorInner::InvalidMagic(self.magic));
317        }
318        if !SUPPORTED_VERSION_RANGE.contains(&ver) {
319            bail!(ErrorInner::UnsupportedVersion(ver.0, ver.1));
320        }
321        Ok(())
322    }
323}
324
/// The type of a section.
///
/// An "open" enum: any `u16` is representable; the known values are the
/// associated constants generated by `impl_open_enum!` below.
#[derive(Clone, Copy, PartialEq, Eq, Hash, FromBytes, IntoBytes, Immutable, KnownLayout)]
#[repr(C, align(2))]
pub struct SectionType(pub le::U16);
329
/// Generate `Debug`, named associated constants, and `is_known` for an
/// open-enum newtype (a struct wrapping an integer value).
///
/// `$ctor` is the const-fn constructor for the wrapped integer type
/// (e.g. `le::U16::new`).
macro_rules! impl_open_enum {
    ($name:ident; $ctor:path; $($(#[$meta:meta])* $variant:ident = $value:expr,)*) => {
        impl std::fmt::Debug for $name {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                // Known values print as their bare name; unknown values fall
                // back to a tuple form `Name(raw)` with the raw integer.
                f.pad(match *self {
                    $(Self::$variant => stringify!($variant),)*
                    _ => return f
                        .debug_tuple(stringify!($name))
                        .field(&self.0.get())
                        .finish(),
                })
            }
        }

        impl $name {
            $(
                $(#[$meta])*
                pub const $variant: Self = Self($ctor($value));
            )*

            /// Return `true` if this value is known by the library.
            #[must_use]
            #[inline]
            pub fn is_known(self) -> bool {
                matches!(self, $(Self::$variant)|*)
            }
        }
    };
}
359
// Known section types; raw values match the on-disk DwarFS format.
impl_open_enum! {
    SectionType; le::U16::new;

    /// A block of data.
    BLOCK = 0,
    /// The schema used to layout on-disk format of Metadata, see [`crate::metadata::Schema`].
    METADATA_V2_SCHEMA = 7,
    /// The bulk of the root metadata, see [`crate::metadata::Metadata`].
    METADATA_V2 = 8,
    /// The index of all sections. This must be the last section if present.
    /// It must not be compressed.
    SECTION_INDEX = 9,
    /// File system history information.
    HISTORY = 10,
}
375
/// Compression algorithm used for section payloads.
///
/// An "open" enum like [`SectionType`]; unknown values are preserved and
/// rejected at decompression time with `UnsupportedCompressAlgo`.
#[derive(Clone, Copy, PartialEq, Eq, Hash, FromBytes, IntoBytes, Immutable, KnownLayout)]
#[repr(C, align(2))]
pub struct CompressAlgo(pub le::U16);
380
// Known compression algorithms; raw values match the on-disk DwarFS format.
impl_open_enum! {
    CompressAlgo; le::U16::new;

    /// Not compressed.
    NONE = 0,
    /// LZMA, aka `.xz` compression. Supported via feature `lzma`.
    LZMA = 1,
    /// Zstd compression. Supported via feature `zstd`.
    ZSTD = 2,
    /// LZ4 compression. Supported via feature `lz4`.
    LZ4 = 3,
    /// LZ4 compression in HC (high-compression) mode. It can be decompressed as normal LZ4.
    /// Supported via feature `lz4`.
    LZ4HC = 4,
    /// Brotli compression. Supported via feature `brotli`.
    BROTLI = 5,
    /// FLAC compression. Not supported.
    FLAC = 6,
    /// Rice++ compression. Not supported.
    RICEPP = 7,
}
402
/// An entry in the section index.
///
/// Packed into one little-endian `u64`: the high 16 bits hold the
/// [`SectionType`], the low 48 bits hold the section offset relative to the
/// first section (see [`SectionIndexEntry::new`]).
#[derive(Clone, Copy, PartialEq, Eq, Hash, FromBytes, IntoBytes, Immutable, KnownLayout)]
#[repr(C, align(8))]
pub struct SectionIndexEntry(pub le::U64);
407
408impl fmt::Debug for SectionIndexEntry {
409    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
410        f.debug_struct("SectionIndexEntry")
411            .field("section_type", &self.section_type())
412            .field("offset", &self.offset())
413            .finish()
414    }
415}
416
417impl SectionIndexEntry {
418    /// Create a section index entry with given section type and offset.
419    ///
420    /// # Errors
421    ///
422    /// If `offset` exceeds 48bits, `None` will be returned.
423    #[must_use]
424    #[inline]
425    pub fn new(typ: SectionType, offset: u64) -> Option<Self> {
426        if offset < 1u64 << 48 {
427            Some(Self((u64::from(typ.0.get()) << 48 | offset).into()))
428        } else {
429            None
430        }
431    }
432
433    /// The type of the section this entry is referring to.
434    #[must_use]
435    #[inline]
436    #[allow(clippy::missing_panics_doc, reason = "never panics")]
437    pub fn section_type(self) -> SectionType {
438        SectionType((self.0 >> 48).try_into().expect("always in u16 range"))
439    }
440
441    /// The offset of the section this entry is referring to,
442    /// relative to the first section.
443    #[must_use]
444    #[inline]
445    pub fn offset(self) -> u64 {
446        self.0.get() & ((1u64 << 48) - 1)
447    }
448}
449
/// The wrapper type for reading sections from a random access reader.
///
/// The inner type should implement [`positioned_io::ReadAt`] to support
/// efficient random access. Typically, [`std::fs::File`] should be used.
/// You do NOT need additional buffering.
///
/// Note: It's *discouraged* to use [`positioned_io::RandomAccessFile`] on *NIX
/// platforms because that would disable readahead which can hurt performance on
/// sequential read inside a several MiB section.
/// On Windows, however, `RandomAccessFile` is several times faster than `File`.
pub struct SectionReader<R: ?Sized> {
    /// The offset of the start of the DwarFS archive in `rdr`, which is added to all
    /// operation offsets.
    archive_start: u64,
    /// The temporary buffer for raw compressed section payload.
    /// It is stored only for allocation reuse. This struct is still state-less.
    raw_buf: Vec<u8>,
    /// The underlying reader; last so `R: ?Sized` (unsized) is allowed.
    rdr: R,
}
469
470impl<R: fmt::Debug + ?Sized> fmt::Debug for SectionReader<R> {
471    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
472        f.debug_struct("SectionReader")
473            .field("archive_start", &self.archive_start)
474            .field(
475                "raw_buf",
476                &format_args!("{}/{}", self.raw_buf.len(), self.raw_buf.capacity()),
477            )
478            .field("rdr", &&self.rdr)
479            .finish()
480    }
481}
482
483impl<R> SectionReader<R> {
484    /// Create a new section reader wrapping an existing random access stream,
485    /// typically, [`std::fs::File`].
486    ///
487    /// You should NOT use [`BufReader`][std::io::BufReader] because sections
488    /// are large enough and high-level abstractions like
489    /// [`Archive`][crate::Archive] already has internal caching.
490    pub fn new(rdr: R) -> Self {
491        Self::new_with_offset(rdr, 0)
492    }
493
494    /// Same as [`Self::new`] but indicates the DwarFS archive is located at
495    /// `archive_start` in `rdr` instead of the start. This is also known as
496    /// `image_offset`.
497    ///
498    /// All read methods of [`SectionReader`] will add `archive_start` to its
499    /// parameter for the real file offset if necessary.
500    pub fn new_with_offset(rdr: R, archive_start: u64) -> Self {
501        SectionReader {
502            archive_start,
503            raw_buf: Vec::new(),
504            rdr,
505        }
506    }
507}
508
509impl<R: ?Sized> SectionReader<R> {
510    /// Get a reference to the underlying reader.
511    #[inline]
512    #[must_use]
513    pub fn get_ref(&self) -> &R {
514        &self.rdr
515    }
516
517    /// Get a mutable reference to the underlying reader.
518    #[inline]
519    #[must_use]
520    pub fn get_mut(&mut self) -> &mut R {
521        &mut self.rdr
522    }
523
524    /// Retrieve the ownership of the underlying reader.
525    #[inline]
526    #[must_use]
527    pub fn into_inner(self) -> R
528    where
529        R: Sized,
530    {
531        self.rdr
532    }
533}
534
535impl<R: ReadAt + ?Sized> SectionReader<R> {
536    /// Get the `archive_start` set on creation.
537    #[inline]
538    #[must_use]
539    pub fn archive_start(&self) -> u64 {
540        self.archive_start
541    }
542
543    /// Read and decompress a full section at `offset` into memory.
544    ///
545    /// This is a shortcut to call [`read_header_at`][Self::read_header_at] and
546    /// [`read_payload_at`][Self::read_payload_at].
547    ///
548    /// # Errors
549    ///
550    /// See `read_header_at` and `read_payload_at`.
551    pub fn read_section_at(
552        &mut self,
553        section_offset: u64,
554        payload_size_limit: usize,
555    ) -> Result<(Header, Vec<u8>)> {
556        let header = self.read_header_at(section_offset)?;
557        // The header is read successfully, so the offset after the header will not overflow.
558        let payload_offset = section_offset + HEADER_SIZE;
559        let payload = self.read_payload_at(&header, payload_offset, payload_size_limit)?;
560        Ok((header, payload))
561    }
562
563    /// Read a section header at `section_offset`.
564    ///
565    /// # Errors
566    ///
567    /// Returns `Err` if section offset overflows, the underlying read operation
568    /// fails, header magic is invalid or header DwarFS version is unsupported.
569    pub fn read_header_at(&mut self, section_offset: u64) -> Result<Header> {
570        let file_offset = self
571            .archive_start
572            .checked_add(section_offset)
573            .ok_or(ErrorInner::OffsetOverflow)?;
574        let mut header = Header::new_zeroed();
575        // For overflowing case, the read must fail because `HEADER_SIZE >= 2`.
576        self.rdr.read_exact_at(file_offset, header.as_mut_bytes())?;
577        header.magic_version.validate()?;
578        Ok(header)
579    }
580
581    /// Read and decompress section payload of given header into a owned `Vec<u8>`.
582    ///
583    /// Same as [`read_payload_at_into`][Self::read_payload_at_into] but returns
584    /// an `Vec<u8>` for convenience.
585    ///
586    /// # Errors
587    ///
588    /// See `read_payload_at_into`.
589    pub fn read_payload_at(
590        &mut self,
591        header: &Header,
592        payload_offset: u64,
593        payload_size_limit: usize,
594    ) -> Result<Vec<u8>> {
595        let mut out = vec![0u8; payload_size_limit];
596        let len = self.read_payload_at_into(header, payload_offset, &mut out)?;
597        out.truncate(len);
598        Ok(out)
599    }
600
601    /// Read and decompress section payload of given header into a buffer.
602    ///
603    /// `payload_offset` is the offset of the body of a section (after the header),
604    /// from the start of archive. Both compressed and decompressed size must
605    /// be within the `out.len()`, or an error will be emitted.
606    ///
607    /// # Errors
608    ///
609    /// Returns `Err` if either:
610    /// - Payload offset overflows
611    /// - Payload size exceeds the limit.
612    /// - The underlying read operation fails.
613    /// - Fast checksum (XXH3-64) of payload disagrees with the header.
614    /// - Decompression fails. This includes decompressed size exceeding the limit.
615    pub fn read_payload_at_into(
616        &mut self,
617        header: &Header,
618        payload_offset: u64,
619        out: &mut [u8],
620    ) -> Result<usize> {
621        let file_offset = self
622            .archive_start
623            .checked_add(payload_offset)
624            .ok_or(ErrorInner::OffsetOverflow)?;
625
626        let size_limit = out.len();
627        let compressed_size = header.payload_size_limited(size_limit)?;
628        let raw_buf = &mut self.raw_buf;
629        raw_buf.resize(compressed_size, 0);
630        self.rdr.read_exact_at(file_offset, raw_buf)?;
631        header.validate_fast_checksum(raw_buf)?;
632
633        match header.compress_algo {
634            CompressAlgo::NONE => {
635                out[..compressed_size].copy_from_slice(raw_buf);
636                Ok(compressed_size)
637            }
638            #[cfg(feature = "zstd")]
639            CompressAlgo::ZSTD => zstd_safe::decompress(out, raw_buf).map_err(|code| {
640                let msg = zstd_safe::get_error_name(code);
641                ErrorInner::Decompress(std::io::Error::new(std::io::ErrorKind::InvalidData, msg))
642                    .into()
643            }),
644            #[cfg(feature = "lzma")]
645            #[expect(
646                clippy::cast_possible_truncation,
647                reason = "will not overflow usize because all data is in memory"
648            )]
649            CompressAlgo::LZMA => (|| {
650                let mut stream = liblzma::stream::Stream::new_stream_decoder(u64::MAX, 0)?;
651                let st = stream.process(raw_buf, out, liblzma::stream::Action::Run)?;
652                if stream.total_in() as usize != raw_buf.len()
653                    || st != liblzma::stream::Status::StreamEnd
654                {
655                    bail!(std::io::Error::new(
656                        std::io::ErrorKind::InvalidData,
657                        "LZMA stream did not end cleanly",
658                    ));
659                }
660                Ok(stream.total_out() as usize)
661            })()
662            .map_err(|err| ErrorInner::Decompress(err).into()),
663            #[cfg(feature = "lz4")]
664            CompressAlgo::LZ4 | CompressAlgo::LZ4HC => {
665                let len = lz4::block::decompress_to_buffer(raw_buf, None, out)
666                    .map_err(ErrorInner::Decompress)?;
667                Ok(len)
668            }
669            // Not supported: FLAC (overlay specific), RICEPP (no much information or library).
670            algo => Err(ErrorInner::UnsupportedCompressAlgo(algo).into()),
671        }
672    }
673
674    /// Construct the section index by traversing all sections.
675    ///
676    /// This will traverse sections one-by-one from `archive_start` to the end
677    /// of stream. All headers will be parsed and validated, but their payloads
678    /// will not.
679    ///
680    /// Note: This may be very costly for large archives or on HDDs because it
681    /// does too many seeks on the disk.
682    ///
683    /// # Errors
684    ///
685    /// Return `Err` if fails to parse or validate section headers (see
686    /// [`SectionReader::read_header_at`]), or if section offset exceeds 48bits,
687    /// which is not representable in section index.
688    pub fn build_section_index(
689        &mut self,
690        stream_len: u64,
691        size_limit: usize,
692    ) -> Result<Vec<SectionIndexEntry>> {
693        let end_offset = stream_len
694            .checked_sub(self.archive_start())
695            .ok_or(ErrorInner::OffsetOverflow)?;
696
697        let mut offset = 0u64;
698        let mut index = Vec::with_capacity(size_limit / size_of::<SectionIndexEntry>());
699        while offset < end_offset {
700            let header = self.read_header_at(offset)?;
701            let ent = SectionIndexEntry::new(header.section_type, offset)
702                .ok_or(ErrorInner::OffsetOverflow)?;
703            if index.len() == index.capacity() {
704                bail!(ErrorInner::PayloadTooLong {
705                    limit: size_limit,
706                    got: None,
707                });
708            }
709            index.push(ent);
710
711            // We just read the header, so the end of header must not overflows.
712            offset = (offset + HEADER_SIZE)
713                .checked_add(header.payload_size.get())
714                .ok_or(ErrorInner::OffsetOverflow)?;
715        }
716        if offset != end_offset {
717            bail!(std::io::Error::new(
718                std::io::ErrorKind::UnexpectedEof,
719                "unexpected end of file"
720            ));
721        }
722        Ok(index)
723    }
724
725    /// Locate and read the section index, if there is any, with a limited payload size.
726    ///
727    /// `stream_len` is the total size of the input reader `R`, which is
728    /// typically the whole file size.
729    ///
730    /// # Detection behaviors
731    ///
732    /// Since there are currently no reliable way to know if there is a section
733    /// index, the tail could just "looks like an index by chance" or being
734    /// collided to like an index intentionally. Currently we do a best-effort
735    /// detection as follows, but it may change in the future.
736    ///
737    /// 1.  If the header of the first section indicates a DwarFS version
738    ///     without section index support, there must not be an index, and
739    ///     `Ok(None)` is returned.
740    ///
741    /// 2.  Otherwise, read 8 bytes at the end. If it does not look like a valid
742    ///     self-pointing `SectionIndexEntry`, `Ok(None)` is returned.
743    ///
744    /// 3.  If it seems to be valid, follows its offset and read a section
745    ///     header. The header should be like a valid section index capturing
746    ///     the trailing 8 bytes, or `Ok(None)` is returned.
747    ///
748    /// 4.  The content of section index is read. It should have a matched
749    ///     checksum, sorted entries with valid section types. If it all
750    ///     passes, `Ok(Some((header, section_index)))` is returned,
751    ///     otherwise `Ok(None)` is returned.
752    ///
753    ///     This should rule out the possibility that a mocked offset with a
754    ///     mocked section header enclosing multiple real sections inside.
755    ///     Because if there is a valid [`Header`] placed inside section index,
756    ///     the magic-version "DWARFSab" would be interpreted as an invalid
757    ///     section type, causing the index to be rejected.
758    ///
759    /// See more discussion: <https://github.com/mhx/dwarfs/issues/264>
760    ///
761    /// # Errors
762    ///
763    /// Returns `Err` for underlying I/O hard-errors.
764    ///
765    /// `Ok(None)` will be returned instead for soft-errors that occur during
766    /// parsing the may-not-exist section index.
767    #[allow(
768        clippy::missing_panics_doc,
769        reason = "allocation failures are allowed to panic at anytime"
770    )]
771    pub fn read_section_index(
772        &mut self,
773        stream_len: u64,
774        payload_size_limit: usize,
775    ) -> Result<Option<(Header, Vec<SectionIndexEntry>)>> {
776        const INDEX_ENTRY_SIZE64: u64 = size_of::<SectionIndexEntry>() as u64;
777        /// See: <https://github.com/mhx/dwarfs/commit/c103783d4bec8aa658e719c2ed7fe329d1d08676>
778        const SECTION_INDEX_MIN_VERSION: (u8, u8) = (2, 4);
779
780        // 1
781        // The first section must be a valid section. Errors can be directly bubbled.
782        let first_magic = self.read_header_at(0)?.magic_version;
783        if (first_magic.major, first_magic.minor) < SECTION_INDEX_MIN_VERSION {
784            return Ok(None);
785        }
786
787        // 2
788        let mut last_entry = SectionIndexEntry::new_zeroed();
789        self.rdr
790            .read_exact_at(stream_len - INDEX_ENTRY_SIZE64, last_entry.as_mut_bytes())?;
791        if last_entry.section_type() != SectionType::SECTION_INDEX {
792            return Ok(None);
793        }
794
795        // 3
796        // Note that we already checked that this does not overflow.
797        let index_header_offset = last_entry.offset();
798        let Ok(header) = self.read_header_at(index_header_offset) else {
799            // This could be offset overflow, or magic validation failure.
800            return Ok(None);
801        };
802        let payload_size = header.payload_size.get();
803        let num_sections = payload_size / INDEX_ENTRY_SIZE64;
804        // Previous read succeeds, so this cannot overflow.
805        if payload_size != stream_len - index_header_offset - HEADER_SIZE
806            || payload_size % INDEX_ENTRY_SIZE64 != 0
807            || header.section_type != SectionType::SECTION_INDEX
808            || header.compress_algo != CompressAlgo::NONE
809            || u64::from(header.section_number.get()) != num_sections - 1
810        {
811            return Ok(None);
812        }
813
814        // 4
815        if payload_size > payload_size_limit as u64 {
816            bail!(ErrorInner::PayloadTooLong {
817                got: Some(payload_size),
818                limit: payload_size_limit
819            });
820        }
821        // The payload size does not overflow `usize` because of previous `if`,
822        // so it / 8 must do not either.
823        let mut entries =
824            SectionIndexEntry::new_vec_zeroed(num_sections as usize).expect("alloc failed");
825        let buf_bytes = entries.as_mut_bytes();
826        debug_assert_eq!(buf_bytes.len() as u64, payload_size);
827        // We checked the size captures the whole tail without overflow.
828        self.rdr
829            .read_exact_at(index_header_offset + HEADER_SIZE, buf_bytes)?;
830
831        // Here, the section passes attribute precondition test. We are almost
832        // certain an index is indeed present because we cannot "collide" so
833        // many preconditions (especially, the section offset) by chance.
834        //
835        // So below this line, we assume its presence, and emit errors instead
836        // of returning `Ok(None)` to alert users there must be something going
837        // wrong: either the index is broken, or the input is "maliciously"
838        // tricking us.
839
840        header.validate_fast_checksum(buf_bytes)?;
841
842        let mut prev = None;
843        for (i, ent) in entries.iter().enumerate() {
844            let (typ, offset) = (ent.section_type(), ent.offset());
845            if !typ.is_known() {
846                bail!(ErrorInner::MalformedSectionIndex(format!(
847                    "entry {i} has unknown section type {typ:?}",
848                )))
849            }
850            if prev.is_some_and(|prev| prev >= offset) {
851                bail!(ErrorInner::MalformedSectionIndex(format!(
852                    "entry {i} has unsorted offset {offset} >= previous offset {prev:?}",
853                )));
854            }
855            prev = Some(offset)
856        }
857
858        Ok(Some((header, entries)))
859    }
860}