Skip to main content

gzip_header/
lib.rs

1//! A library to decode and encode headers for the
2//! [gzip format](http://www.gzip.org/zlib/rfc-gzip.html).
//! The library also contains a reader abstraction over a CRC checksum hasher.
4//!
5//! A file in the gzip format contains a gzip header, a number of compressed data blocks in the
6//! [DEFLATE](http://www.gzip.org/zlib/rfc-deflate.html) format, and ends with the CRC32-checksum
7//! (in the IEEE format) and number of bytes (modulo `2^32`) of the uncompressed data.
8//!
9//! The gzip header is purely a set of metadata, and doesn't have any impact on the decoding of the
10//! compressed data other than the fact that `DEFLATE`-encoded data with a gzip-header is
11//! checked using the CRC32 algorithm.
12//!
13//! This library is based on the gzip header functionality in the
14//! [flate2](https://crates.io/crates/flate2) crate.
15
16#![forbid(unsafe_code)]
17#![no_std]
18extern crate alloc;
19#[cfg(feature = "std")]
20extern crate std;
21
22mod crc_reader;
23
24use alloc::borrow::Cow;
25use alloc::ffi::CString;
26use core::default::Default;
27use core::fmt;
28#[cfg(feature = "std")]
29use std::io::Read;
30#[cfg(feature = "std")]
31use std::{env, io, time};
32
33pub use crc_reader::Crc;
34#[cfg(feature = "std")]
35pub use crc_reader::CrcReader;
36
// Bit masks for the `FLG` byte of the gzip header.

/// Set when the header ends with a two-byte checksum of the header bytes.
const FHCRC: u8 = 1 << 1;
/// Set when the header contains a length-prefixed `extra` field.
const FEXTRA: u8 = 1 << 2;
/// Set when the header contains a zero-terminated file name.
const FNAME: u8 = 1 << 3;
/// Set when the header contains a zero-terminated comment.
const FCOMMENT: u8 = 1 << 4;
41
/// Operating system / file system identifiers that can be stored in the `OS` byte of a gzip
/// header.
///
/// The values follow <http://www.gzip.org/format.txt>; in addition, `Apple` (19) is a value
/// defined by the zlib library.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
#[repr(u8)]
pub enum FileSystemType {
    /// MS-DOS/old FAT filesystem.
    Fat = 0,
    /// Amiga.
    Amiga = 1,
    /// VMS or OpenVMS.
    Vms = 2,
    /// Unix type systems, including Linux.
    Unix = 3,
    /// VM/CMS.
    Vcms = 4,
    /// Atari TOS.
    AtariTos = 5,
    /// HPFS filesystem.
    Hpfs = 6,
    /// Used for Apple platforms. Newer encoders may use 19 instead for modern systems.
    Macintosh = 7,
    /// Z-System.
    Zsystem = 8,
    /// CP/M.
    Cpm = 9,
    /// This is used for Windows/NTFS in zlib newer than 1.2.11, but not in gzip due to following
    /// updates to the ZIP format.
    /// See <https://github.com/madler/zlib/issues/235> and
    /// <https://github.com/madler/zlib/commit/ce12c5cd00628bf8f680c98123a369974d32df15>
    Tops20OrNTFS = 10,
    /// Used for Windows platforms by older zlib versions and other encoders.
    NTFS = 11,
    /// SMS/QDOS.
    SmsQdos = 12,
    /// Acorn RISC OS.
    Riscos = 13,
    /// Newer FAT filesystems (i.e. FAT32).
    Vfat = 14,
    /// MVS or PRIMOS.
    Mvs = 15,
    /// BeOS.
    Beos = 16,
    /// Tandem/NSK.
    TandemNsk = 17,
    /// THEOS.
    Theos = 18,
    /// Modern Apple platforms.
    /// Defined in the zlib library (see zutil.h).
    Apple = 19,
    /// Unknown or unset.
    Unknown = 255,
}
80
81impl FileSystemType {
82    /// Get the raw byte value of this `FileSystemType` variant.
83    pub const fn as_u8(&self) -> u8 {
84        *self as u8
85    }
86
87    /// Get the corresponding `ExtraFlags` value from a raw byte.
88    ///
89    /// Returns `FileSystemType::Unknown` (defined as 255 as that is the value used in the
90    /// specification for `Unknown`) if the value is not one of the currently known types
91    /// (Which currently means any value > 19).
92    pub fn from_u8(value: u8) -> FileSystemType {
93        use FileSystemType::*;
94        match value {
95            0 => Fat,
96            1 => Amiga,
97            2 => Vms,
98            3 => Unix,
99            4 => Vcms,
100            5 => AtariTos,
101            6 => Hpfs,
102            7 => Macintosh,
103            8 => Zsystem,
104            9 => Cpm,
105            10 => Tops20OrNTFS,
106            11 => NTFS,
107            12 => SmsQdos,
108            13 => Riscos,
109            14 => Vfat,
110            15 => Mvs,
111            16 => Beos,
112            17 => TandemNsk,
113            18 => Theos,
114            19 => Apple,
115            _ => Unknown,
116        }
117    }
118}
119
120impl fmt::Display for FileSystemType {
121    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
122        use FileSystemType::*;
123        match *self {
124            Fat => "FAT filesystem (MS-DOS, OS/2, NT/Win32)",
125            Amiga => "Amiga",
126            Vms => "VMS or OpenVMS",
127            Unix => "Unix type system/Linux",
128            Vcms => "VM/CMS",
129            AtariTos => "Atari TOS",
130            Hpfs => "HPFS filesystem (OS/2, NT)",
131            Macintosh => "Macintosh operating system (Classic Mac OS, OS/X, macOS, iOS etc.)",
132            Zsystem => "Z-System",
133            Cpm => "CP/M",
134            Tops20OrNTFS => "NTFS (New zlib versions) or TOPS-20",
135            NTFS => "NTFS",
136            SmsQdos => "SMS/QDOS",
137            Riscos => "Acorn RISC OS",
138            Vfat => "VFAT file system (Win95, NT)",
139            Mvs => "MVS or PRIMOS",
140            Beos => "BeOS",
141            TandemNsk => "Tandem/NSK",
142            Theos => "THEOS",
143            Apple => "macOS, OS/X, iOS or watchOS",
144            _ => "Unknown or unset",
145        }
146        .fmt(f)
147    }
148}
149
/// The values the gzip specification defines for the extra flags (`XFL`) byte.
///
/// The field is reserved for use by the compression method. For deflate, the only specified
/// method, it indicates the compression level of the contained data. It is purely a hint that
/// the encoder may set, and does not have to match the level that was actually used.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
#[repr(u8)]
pub enum ExtraFlags {
    /// No extra flags set (also used for unknown values).
    #[default]
    Default = 0,
    /// The encoder used its maximum compression setting.
    MaximumCompression = 2,
    /// The encoder used its fastest compression setting.
    FastestCompression = 4,
}
164
165impl ExtraFlags {
166    /// Get the corresponding `ExtraFlags` value from a raw byte.
167    ///
168    /// Returns `ExtraFlags::Default` (defined as 0 by the gzip specification) for values other than
169    /// 2 and 4.
170    pub fn from_u8(value: u8) -> ExtraFlags {
171        use ExtraFlags::*;
172        match value {
173            2 => MaximumCompression,
174            4 => FastestCompression,
175            _ => Default,
176        }
177    }
178
179    /// Get the raw byte value of this `ExtraFlags` variant.
180    pub const fn as_u8(&self) -> u8 {
181        *self as u8
182    }
183}
184
185impl fmt::Display for ExtraFlags {
186    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
187        match *self {
188            ExtraFlags::Default => "No extra flags (Default) or unknown.",
189            ExtraFlags::MaximumCompression => "Maximum compression algorithm (DEFLATE).",
190            ExtraFlags::FastestCompression => "Fastest compression algorithm (DEFLATE)",
191        }
192        .fmt(f)
193    }
194}
195
196/// A builder structure to create a new gzip header.
197///
198/// This structure controls header configuration options such as the filename.
199#[derive(Debug, Default, Clone, Eq, PartialEq)]
200pub struct GzBuilder {
201    extra: Option<alloc::vec::Vec<u8>>,
202    filename: Option<CString>,
203    comment: Option<CString>,
204    // Whether this should be signed is a bit unclear, the gzip spec says mtime is in the unix
205    // time format, which is normally signed, however zlib seems to use an unsigned long for this
206    // field.
207    mtime: u32,
208    os: Option<FileSystemType>,
209    xfl: ExtraFlags,
210}
211
212impl GzBuilder {
213    /// Create a new blank builder with no header by default.
214    pub fn new() -> GzBuilder {
215        GzBuilder {
216            extra: None,
217            filename: None,
218            comment: None,
219            mtime: 0,
220            os: None,
221            xfl: ExtraFlags::Default,
222        }
223    }
224
225    /// Configure the `mtime` field in the gzip header.
226    pub fn mtime(mut self, mtime: u32) -> GzBuilder {
227        self.mtime = mtime;
228        self
229    }
230
231    /// Configure the `extra` field in the gzip header.
232    pub fn extra<T: Into<alloc::vec::Vec<u8>>>(mut self, extra: T) -> GzBuilder {
233        self.extra = Some(extra.into());
234        self
235    }
236
237    /// Configure the `filename` field in the gzip header.
238    ///
239    /// # Panics
240    /// Panics if the filename argument contains a byte with the value 0.
241    pub fn filename<T: Into<alloc::vec::Vec<u8>>>(mut self, filename: T) -> GzBuilder {
242        self.filename = Some(CString::new(filename).unwrap());
243        self
244    }
245
246    /// Configure the `comment` field in the gzip header.
247    ///
248    /// # Panics
249    /// Panics if the comment argument contains a byte with the value 0.
250    pub fn comment<T: Into<alloc::vec::Vec<u8>>>(mut self, comment: T) -> GzBuilder {
251        self.comment = Some(CString::new(comment).unwrap());
252        self
253    }
254
255    /// Configure the `os` field in the gzip header.
256    ///
257    /// This is taken from `std::env::consts::OS` if not set explicitly.
258    pub fn os(mut self, os: FileSystemType) -> GzBuilder {
259        self.os = Some(os);
260        self
261    }
262
263    /// Configure the `xfl` field in the gzip header.
264    ///
265    /// The default is `ExtraFlags::Default` (meaning not set).
266    pub fn xfl(mut self, xfl: ExtraFlags) -> GzBuilder {
267        self.xfl = xfl;
268        self
269    }
270
271    /// Transforms this builder structure into a raw vector of bytes, setting the `XFL` field to the
272    /// value specified by `lvl`.
273    pub fn into_header_xfl(mut self, lvl: ExtraFlags) -> alloc::vec::Vec<u8> {
274        self.xfl = lvl;
275        self.into_header()
276    }
277
278    /// Transforms this builder structure into a raw vector of bytes.
279    pub fn into_header(self) -> alloc::vec::Vec<u8> {
280        self.into_header_inner(false)
281    }
282
283    /// Transforms this builder structure into a raw vector of bytes.
284    pub fn into_header_with_checksum(self) -> alloc::vec::Vec<u8> {
285        self.into_header_inner(true)
286    }
287
288    fn into_header_inner(self, use_crc: bool) -> alloc::vec::Vec<u8> {
289        let GzBuilder {
290            extra,
291            filename,
292            comment,
293            mtime,
294            os,
295            xfl,
296        } = self;
297        let os = match os {
298            Some(f) => f,
299            // Set the OS based on the system the binary is compiled for if not set,
300            // as this is a required field.
301            // These defaults are taken from what modern zlib uses, which are not the same as
302            // what's used in flate2.
303            None => {
304                #[cfg(feature = "std")]
305                match env::consts::OS {
306                    "linux" | "freebsd" | "dragonfly" | "netbsd" | "openbsd" | "solaris"
307                    | "bitrig" => FileSystemType::Unix,
308                    "macos" => FileSystemType::Apple,
309                    "win32" => FileSystemType::Tops20OrNTFS,
310                    _ => FileSystemType::Unknown,
311                }
312                #[cfg(not(feature = "std"))]
313                FileSystemType::Unknown
314            }
315        };
316        let mut flg = 0;
317        if use_crc {
318            flg |= FHCRC;
319        };
320        let mut header = alloc::vec![0u8; 10];
321
322        if let Some(v) = extra {
323            flg |= FEXTRA;
324            header.push((v.len()/* >> 0*/) as u8);
325            header.push((v.len() >> 8) as u8);
326            header.extend(v);
327        }
328
329        if let Some(filename) = filename {
330            flg |= FNAME;
331            header.extend(filename.as_bytes_with_nul().iter().cloned());
332        }
333
334        if let Some(comment) = comment {
335            flg |= FCOMMENT;
336            header.extend(comment.as_bytes_with_nul().iter().cloned());
337        }
338
339        header[0] = 0x1f;
340        header[1] = 0x8b;
341        header[2] = 8;
342        header[3] = flg;
343        header[4] = mtime /*>> 0*/ as u8;
344        header[5] = (mtime >> 8) as u8;
345        header[6] = (mtime >> 16) as u8;
346        header[7] = (mtime >> 24) as u8;
347        header[8] = xfl.as_u8();
348        header[9] = os.as_u8();
349
350        if use_crc {
351            let mut crc = Crc::new();
352            crc.update(&header);
353            let checksum = crc.sum() as u16;
354            header.extend(&[checksum as u8, (checksum >> 8) as u8]);
355        }
356
357        header
358    }
359}
360
/// The decoded fields of the raw header of a gzip stream.
///
/// The optional fields hold metadata about the compressed file, when the stream provides it.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct GzHeader {
    /// Raw contents of the `extra` field, if the header has one.
    extra: Option<alloc::vec::Vec<u8>>,
    /// Raw bytes of the file name, if the header has one.
    filename: Option<alloc::vec::Vec<u8>>,
    /// Raw bytes of the comment, if the header has one.
    comment: Option<alloc::vec::Vec<u8>>,
    /// Value of the `MTIME` field.
    mtime: u32,
    /// Raw value of the `OS` byte.
    os: u8,
    /// Raw value of the `XFL` byte.
    xfl: u8,
}
374
375impl GzHeader {
376    /// Returns the `filename` field of this gzip header, if present.
377    ///
378    /// The `filename` field the gzip header is supposed to be stored using ISO 8859-1 (LATIN-1)
379    /// encoding and be zero-terminated if following the specification.
380    pub fn filename(&self) -> Option<&[u8]> {
381        self.filename.as_ref().map(|s| &s[..])
382    }
383
384    /// Returns the `extra` field of this gzip header, if present.
385    pub fn extra(&self) -> Option<&[u8]> {
386        self.extra.as_ref().map(|s| &s[..])
387    }
388
389    /// Returns the `comment` field of this gzip stream's header, if present.
390    ///
391    /// The `comment` field in the gzip header is supposed to be stored using ISO 8859-1 (LATIN-1)
392    /// encoding and be zero-terminated if following the specification.
393    pub fn comment(&self) -> Option<&[u8]> {
394        self.comment.as_ref().map(|s| &s[..])
395    }
396
397    /// Returns the `mtime` field of this gzip header.
398    ///
399    /// This gives the most recent modification time of the contained file, or alternatively
400    /// the timestamp of when the file was compressed if the data did not come from a file, or
401    /// a timestamp was not available when compressing. The time is specified the Unix format,
402    /// that is: seconds since 00:00:00 GMT, Jan. 1, 1970. (Not that this may cause problems for
403    /// MS-DOS and other systems that use local rather than Universal time.)
404    /// An `mtime` value of 0 means that the timestamp is not set.
405    pub const fn mtime(&self) -> u32 {
406        self.mtime
407    }
408
409    /// Returns the `mtime` field of this gzip header as a `SystemTime` if present.
410    ///
411    /// Returns `None` if the `mtime` is not set, i.e 0.
412    /// See [`mtime`](#method.mtime) for more detail.
413    #[cfg(feature = "std")]
414    pub fn mtime_as_datetime(&self) -> Option<time::SystemTime> {
415        if self.mtime == 0 {
416            None
417        } else {
418            let duration = time::Duration::new(u64::from(self.mtime), 0);
419            let datetime = time::UNIX_EPOCH + duration;
420            Some(datetime)
421        }
422    }
423
424    /// Returns the `os` field of this gzip stream's header.
425    pub const fn os(&self) -> u8 {
426        self.os
427    }
428
429    /// Returns the `xfl` field of this gzip stream's header.
430    pub const fn xfl(&self) -> u8 {
431        self.xfl
432    }
433}
434
/// Render an optional byte field as text for display purposes.
///
/// Bytes are interpreted as UTF-8 with invalid sequences replaced; a missing field is shown
/// as `(Not set)`.
#[inline]
fn into_string<'a>(data: Option<&'a [u8]>) -> alloc::borrow::Cow<'a, str> {
    match data {
        Some(bytes) => alloc::string::String::from_utf8_lossy(bytes),
        None => Cow::Borrowed("(Not set)"),
    }
}
442
443impl fmt::Display for GzHeader {
444    /// Crudely display the contents of the header
445    ///
446    /// Note that filename/commend are required to be ISO 8859-1 (LATIN-1) encoded by the spec,
447    /// however to avoid dragging in dependencies we simply interpret them as UTF-8.
448    /// This may result in garbled output if the names contain special characters.
449    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
450        write!(
451            f,
452            "Filename: {}\n\
453             Comment: {}\n\
454             Extra: {:?}\n\
455             mtime: {}\n\
456             os: {}\n\
457             xfl: {}",
458            into_string(self.filename()),
459            into_string(self.comment()),
460            // We display extra as raw bytes for now.
461            self.extra,
462            self.mtime,
463            FileSystemType::from_u8(self.os),
464            ExtraFlags::Default, //ExtraFlags::from_u8(self.xfl),
465        )
466    }
467}
468
/// Build the error reported when the stored header checksum does not match the computed one.
#[cfg(feature = "std")]
fn corrupt() -> io::Error {
    let message = "corrupt gzip stream does not have a matching header checksum";
    io::Error::new(io::ErrorKind::InvalidInput, message)
}
476
/// Build the error reported when the fixed part of the header is not a valid gzip header.
#[cfg(feature = "std")]
fn bad_header() -> io::Error {
    let kind = io::ErrorKind::InvalidInput;
    io::Error::new(kind, "invalid gzip header")
}
481
/// Read two bytes from `r` and interpret them as a little-endian `u16`.
///
/// Any error from the underlying `read_exact` call is returned unchanged.
#[cfg(feature = "std")]
fn read_le_u16<R: Read>(r: &mut R) -> io::Result<u16> {
    let mut bytes = [0u8; 2];
    r.read_exact(&mut bytes)?;
    Ok(u16::from_le_bytes(bytes))
}
489
/// Read bytes up to and including the next zero byte, returning them without the terminator.
///
/// Fails with `ErrorKind::UnexpectedEof` if the stream ends before a zero byte is found, since
/// the field is then incomplete.
#[cfg(feature = "std")]
fn read_nul_terminated<R: Read>(r: &mut R) -> io::Result<alloc::vec::Vec<u8>> {
    let mut field = alloc::vec::Vec::new();
    // Reading byte by byte is slow, but the length of the field is not known up front.
    for byte in r.by_ref().bytes() {
        match byte? {
            0 => return Ok(field),
            other => field.push(other),
        }
    }
    Err(io::Error::new(
        io::ErrorKind::UnexpectedEof,
        "gzip header field is missing its zero terminator",
    ))
}

/// Try to read a gzip header from the provided reader.
///
/// Note that a gzip stream can contain multiple "members". Each member contains a header,
/// followed by compressed data and finally a checksum and byte count.
/// This method will only read the header for the "member" at the start of the stream.
///
/// # Errors
/// Returns an `io::Error` with `ErrorKind::InvalidInput` if the data is not a valid gzip
/// header: wrong magic bytes, a compression method other than 8, reserved flag bits set, or a
/// header checksum that does not match. Errors from the underlying reader are passed through,
/// and a header that ends prematurely yields `ErrorKind::UnexpectedEof`.
#[cfg(feature = "std")]
pub fn read_gz_header<R: Read>(r: &mut R) -> io::Result<GzHeader> {
    // Bits 5-7 of `FLG` are reserved. RFC 1952 requires a decoder to report an error if any
    // of them is set, as they could announce fields this parser does not know how to skip.
    const FRESERVED: u8 = 0b1110_0000;

    let mut crc_reader = CrcReader::new(r);
    let mut header = [0u8; 10];
    crc_reader.read_exact(&mut header)?;
    let [id1, id2, cm, flg, mtime0, mtime1, mtime2, mtime3, xfl, os] = header;

    // `ID1` and `ID2` are fixed values to identify a gzip file.
    if id1 != 0x1f || id2 != 0x8b {
        return Err(bad_header());
    }
    // `CM` describes the compression method. Currently only method 8 (DEFLATE) is specified
    // by the gzip format.
    if cm != 8 {
        return Err(bad_header());
    }
    // `FLG`: the bits in this field indicate whether the `FHCRC`, `FEXTRA`, `FNAME` and
    // `FCOMMENT` fields are present in the header.
    if flg & FRESERVED != 0 {
        return Err(bad_header());
    }

    let mtime = u32::from_le_bytes([mtime0, mtime1, mtime2, mtime3]);
    // `xfl` describes the compression level used by the encoder (it may not actually match
    // what the encoder used and has no impact on decompression); `os` describes what type of
    // operating system/file system the file was created on. Both are kept as raw bytes.

    let extra = if flg & FEXTRA != 0 {
        // The field is prefixed with its length as a little-endian u16.
        let xlen = read_le_u16(&mut crc_reader)?;
        let mut extra = alloc::vec![0u8; usize::from(xlen)];
        crc_reader.read_exact(&mut extra)?;
        Some(extra)
    } else {
        None
    };
    let filename = if flg & FNAME != 0 {
        Some(read_nul_terminated(&mut crc_reader)?)
    } else {
        None
    };
    let comment = if flg & FCOMMENT != 0 {
        Some(read_nul_terminated(&mut crc_reader)?)
    } else {
        None
    };

    // If the `FHCRC` flag is set, the header contains a two-byte CRC16 checksum of the header
    // bytes that needs to be validated.
    if flg & FHCRC != 0 {
        // Take the checksum before reading the stored value, so that the stored bytes are not
        // part of the computed sum.
        let calced_crc = crc_reader.crc().sum() as u16;
        let stored_crc = read_le_u16(&mut crc_reader)?;
        if calced_crc != stored_crc {
            return Err(corrupt());
        }
    }

    Ok(GzHeader {
        extra,
        filename,
        comment,
        mtime,
        os,
        xfl,
    })
}
587
#[cfg(test)]
mod tests {
    extern crate std;
    use super::*;

    /// Build a header with every optional field set, parse it back and compare the fields.
    ///
    /// Only compiled with the `std` feature, as `read_gz_header` is not available without it.
    #[cfg(feature = "std")]
    fn roundtrip_inner(use_crc: bool) {
        use std::io::Cursor;

        const COMMENT: &[u8] = b"Comment";
        const FILENAME: &[u8] = b"Filename";
        const EXTRA: &[u8] = &[1, 2, 3, 4];
        const MTIME: u32 = 12345;
        const OS: FileSystemType = FileSystemType::NTFS;
        const XFL: ExtraFlags = ExtraFlags::FastestCompression;

        let header = GzBuilder::new()
            .comment(COMMENT)
            .filename(FILENAME)
            .extra(EXTRA)
            .mtime(MTIME)
            .os(OS)
            .xfl(XFL)
            .into_header_inner(use_crc);

        let mut reader = Cursor::new(header);

        let header_read = read_gz_header(&mut reader).unwrap();

        assert_eq!(header_read.comment().unwrap(), COMMENT);
        assert_eq!(header_read.filename().unwrap(), FILENAME);
        assert_eq!(header_read.extra().unwrap(), EXTRA);
        assert_eq!(header_read.mtime(), MTIME);
        assert_eq!(header_read.os(), OS.as_u8());
        assert_eq!(header_read.xfl(), XFL.as_u8());
    }

    #[cfg(feature = "std")]
    #[test]
    fn roundtrip() {
        roundtrip_inner(false);
    }

    #[cfg(feature = "std")]
    #[test]
    fn roundtrip_with_crc() {
        roundtrip_inner(true);
    }

    #[test]
    fn filesystem_enum() {
        // Every defined value below `Unknown` maps to a variant with the same raw value.
        for n in 0..=19u8 {
            assert_eq!(n, FileSystemType::from_u8(n).as_u8());
        }

        // Everything else, including 255 itself, is reported as `Unknown`.
        for n in 20..=u8::MAX {
            assert_eq!(FileSystemType::from_u8(n), FileSystemType::Unknown);
        }
    }
}