ggstd/compress/gzip/
gunzip.rs

1// Copyright 2023 The rust-ggstd authors. All rights reserved.
2// Copyright 2009 The Go Authors. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6use crate::compress::flate;
7use crate::encoding::binary::{ByteOrder, LITTLE_ENDIAN};
8use crate::errors;
9use crate::hash::crc32;
10use crate::io as ggio;
11use crate::time;
12
13pub(super) const GZIP_ID1: u8 = 0x1f;
14const GZIP_ID2: u8 = 0x8b;
15const GZIP_DEFLATE: u8 = 8;
16const _FLAG_TEXT: u8 = 1 << 0;
17const FLAG_HDR_CRC: u8 = 1 << 1;
18const FLAG_EXTRA: u8 = 1 << 2;
19const FLAG_NAME: u8 = 1 << 3;
20const FLAG_COMMENT: u8 = 1 << 4;
21
/// ERR_CHECKSUM_MSG is the error message used to build the std::io::Error
/// that is returned when the checksum is wrong.
pub const ERR_CHECKSUM_MSG: &str = "gzip: invalid checksum";
/// ERR_INVALID_HEADER is the error message used to build the std::io::Error
/// that is returned when the header is wrong.
pub const ERR_INVALID_HEADER: &str = "gzip: invalid header";
28
29fn get_err_checksum() -> std::io::Error {
30    errors::new_stdio_other_error(ERR_CHECKSUM_MSG.to_string())
31}
32
33fn get_err_invalid_header() -> std::io::Error {
34    errors::new_stdio_other_error(ERR_INVALID_HEADER.to_string())
35}
36
37/// The gzip file stores a header giving metadata about the compressed file.
38/// That header is exposed as the fields of the Writer and Reader structs.
39///
40/// Strings must be UTF-8 encoded and may only contain Unicode code points
41/// U+0001 through U+00FF, due to limitations of the GZIP file format.
42#[derive(Debug)]
43pub struct Header {
44    pub comment: Option<String>, // comment
45    pub extra: Option<Vec<u8>>,  // "extra data"
46    pub mod_time: time::Time,    // modification time
47    pub name: Option<String>,    // file name
48    pub os: u8,                  // operating system type
49}
50
/// ReadState bundles the running checksum and the scratch buffer that are
/// used while parsing gzip headers and checking decompressed data.
struct ReadState {
    /// CRC-32, IEEE polynomial (section 8).
    digest: u32,
    /// Scratch space for header bytes and NUL-terminated header strings.
    buf: Vec<u8>,
}
55
56/// A Reader can be used to retrieve
57/// uncompressed data from a gzip-format compressed file.
58///
59/// In general, a gzip file can be a concatenation of gzip files,
60/// each with its own header. Reads from the Reader
61/// return the concatenation of the uncompressed data of each.
62/// Only the first header is recorded in the Reader fields.
63///
64/// Gzip files store a length and checksum of the uncompressed data.
65/// The Reader will return an ErrChecksum when Read
66/// reaches the end of the uncompressed data if it does not
67/// have the expected length or checksum. Clients should treat data
68/// returned by Read as tentative until they receive the io.EOF
69/// marking the end of the data.
70pub struct Reader<'a, Input: std::io::BufRead> {
71    pub header: Option<Header>, // valid after Reader::new or Reader.reset
72    read_state: ReadState,
73    decompressor: flate::Reader<&'a mut Input>,
74    size: u32, // Uncompressed size (section 2.3.1)
75    err: Option<std::io::Error>,
76    multistream: bool,
77}
78
79impl<'a, Input: std::io::BufRead> Reader<'a, Input> {
80    /// new creates a new Reader reading the given reader.
81    ///
82    /// Make sure that the reader implements buffering otherwise the performance
83    /// can be low.  You can use std::io::BufReader to add buffering to any reader.
84    ///
85    ///
86    // If r does not also implement io.ByteReader,
87    // the decompressor may read more data than necessary from r.
88    //
89    // It is the caller's responsibility to call close on the Reader when done.
90    //
91    /// The Reader.header fields will be valid in the Reader returned.
92    /// If Reader.header is None, then there is no stream available.
93    pub fn new(r: &'a mut Input) -> std::io::Result<Self> {
94        // 	z := new(Reader)
95        // 	if err := self.reset(r); err != nil {
96        // 		return nil, err
97        // 	}
98        // 	return z, nil
99
100        let mut read_state = ReadState::new();
101        let header = read_state.read_header(r)?;
102        let decompressor = flate::Reader::new(r);
103
104        Ok(Self {
105            header,
106            decompressor,
107            read_state,
108            size: 0,
109            err: None,
110            multistream: true,
111        })
112    }
113
114    /// reset discards the Reader self's state and makes it equivalent to the
115    /// result of its original state from Reader::new, but reading from r instead.
116    /// This permits reusing a Reader rather than allocating a new one.
117    pub fn reset(&mut self, r: &'a mut Input) -> std::io::Result<()> {
118        self.read_state = ReadState::new();
119        self.header = self.read_state.read_header(r)?;
120        self.decompressor.reset(r, &[]);
121        self.multistream = true;
122        self.size = 0;
123        self.err = None;
124        Ok(())
125    }
126
127    /// reset_state is similar to reset, but reuses the underlying reader.
128    pub fn reset_state(&mut self) -> std::io::Result<()> {
129        self.read_state = ReadState::new();
130        self.header = self
131            .read_state
132            .read_header(self.decompressor.input_reader())?;
133        self.decompressor.reset_state(&[]);
134        self.multistream = true;
135        self.size = 0;
136        self.err = None;
137        Ok(())
138    }
139
140    /// multistream controls whether the reader supports multistream files.
141    ///
142    /// If enabled (the default), the Reader expects the input to be a sequence
143    /// of individually gzipped data streams, each with its own header and
144    /// trailer, ending at EOF. The effect is that the concatenation of a sequence
145    /// of gzipped files is treated as equivalent to the gzip of the concatenation
146    /// of the sequence. This is standard behavior for gzip readers.
147    ///
148    /// Calling Multistream(false) disables this behavior; disabling the behavior
149    /// can be useful when reading file formats that distinguish individual gzip
150    /// data streams or mix gzip data streams with other data streams.
151    /// In this mode, when the Reader reaches the end of the data stream,
152    /// Read returns io.EOF. The underlying reader must implement io.ByteReader
153    /// in order to be left positioned just after the gzip stream.
154    /// To start the next stream, call self.reset(r) followed by self.Multistream(false).
155    /// If there is no next stream, self.reset(r) will return io.EOF.
156    pub fn multistream(&mut self, ok: bool) {
157        self.multistream = ok;
158    }
159
160    /// close closes the Reader. It does not close the underlying io.Reader.
161    /// In order for the GZIP checksum to be verified, the reader must be
162    /// fully consumed until the io.EOF.
163    pub fn close(&mut self) -> std::io::Result<()> {
164        self.decompressor.close()
165    }
166
167    /// is_eof returns true if there are no more gzip steams available.
168    pub fn is_eof(&self) -> bool {
169        self.header.is_none()
170    }
171}
172
173impl<Input: std::io::BufRead> crate::io::Reader for Reader<'_, Input> {
174    /// read implements io.Reader, reading uncompressed bytes from its underlying Reader.
175    fn read(&mut self, p: &mut [u8]) -> ggio::IoRes {
176        let mut n = 0;
177        if self.err.is_some() {
178            return (0, errors::copy_stdio_option_error(&self.err));
179        }
180
181        while n == 0 {
182            if self.is_eof() {
183                return ggio::EOF;
184            }
185            let res = crate::io::Reader::read(&mut self.decompressor, p);
186            self.read_state.digest =
187                crc32::update(self.read_state.digest, &crc32::IEEE_TABLE, &p[..res.0]);
188            self.size += res.0 as u32;
189            if !ggio::is_eof(&res) {
190                return res;
191            }
192            (n, self.err) = res;
193
194            // Finished file; check checksum and size.
195            {
196                let mut buf = [0; 8];
197                if let Err(err) = self.decompressor.input_reader().read_exact(&mut buf) {
198                    self.err = Some(err);
199                    return (n, errors::copy_stdio_option_error(&self.err));
200                }
201                let digest = LITTLE_ENDIAN.uint32(&buf[..4]);
202                let size = LITTLE_ENDIAN.uint32(&buf[4..8]);
203                if digest != self.read_state.digest || size != self.size {
204                    self.err = Some(get_err_checksum());
205                    return (n, errors::copy_stdio_option_error(&self.err));
206                }
207            }
208            self.read_state.digest = 0;
209            self.size = 0;
210
211            // File is ok; check if there is another.
212            if !self.multistream {
213                return (n, None);
214            }
215            self.err = None; // Remove io.EOF
216
217            self.read_state = ReadState::new();
218            self.header = match self
219                .read_state
220                .read_header(self.decompressor.input_reader())
221            {
222                Ok(header) => header,
223                Err(err) => {
224                    return (n, Some(err));
225                }
226            };
227            self.decompressor.reset_state(&[]);
228        }
229
230        (n, None)
231    }
232}
233
234impl<Input: std::io::BufRead> std::io::Read for Reader<'_, Input> {
235    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
236        let res = crate::io::Reader::read(self, buf);
237        if res.0 > 0 {
238            Ok(res.0)
239        } else if ggio::is_eof(&res) {
240            return Ok(0);
241        } else {
242            return Err(res.1.unwrap());
243        }
244    }
245}
246
247impl ReadState {
248    fn new() -> Self {
249        Self {
250            digest: 0,
251            buf: vec![0; 512],
252        }
253    }
254
255    /// read_header reads the GZIP header according to section 2.3.1.
256    /// This method does not set self.err.
257    fn read_header<T: std::io::BufRead>(
258        &mut self,
259        r: &mut T,
260    ) -> Result<Option<Header>, std::io::Error> {
261        // 	if _, err = io.ReadFull(r, self.buf[..10]); err != nil {
262        // 		// RFC 1952, section 2.2, says the following:
263        // 		//	A gzip file consists of a series of "members" (compressed data sets).
264        // 		//
265        // 		// Other than this, the specification does not clarify whether a
266        // 		// "series" is defined as "one or more" or "zero or more". To err on the
267        // 		// side of caution, Go interprets this to mean "zero or more".
268        // 		// Thus, it is okay to return io.EOF here.
269        // 		return hdr, err
270        // 	}
271        let res = ggio::read_full(r, &mut self.buf[..10]);
272        if res.0 == 0 && ggio::is_unexpected_eof(&res) {
273            // nothing was read, there is no header
274            return Ok(None);
275        }
276        if let Some(err) = res.1 {
277            return Err(err);
278        }
279        if self.buf[0] != GZIP_ID1 || self.buf[1] != GZIP_ID2 || self.buf[2] != GZIP_DEFLATE {
280            return Err(get_err_invalid_header());
281        }
282        let flg = self.buf[3];
283        let t = LITTLE_ENDIAN.uint32(&self.buf[4..8]) as i64;
284        let mod_time = if t > 0 {
285            // Section 2.3.1, the zero value for MTIME means that the
286            // modified time is not set.
287            time::unix(t, 0)
288        } else {
289            time::Time::default()
290        };
291        // self.buf[8] is XFL and is currently ignored.
292        let os = self.buf[9];
293        self.digest = crc32::checksum_ieee(&self.buf[..10]);
294
295        let extra_data: Option<Vec<u8>> = if flg & FLAG_EXTRA != 0 {
296            let mut buf = [0; 2];
297            r.read_exact(&mut buf)?;
298            self.digest = crc32::update(self.digest, &crc32::IEEE_TABLE, &buf[..2]);
299            let mut data = vec![0; LITTLE_ENDIAN.uint16(&buf[..2]) as usize];
300            r.read_exact(&mut data)?;
301            self.digest = crc32::update(self.digest, &crc32::IEEE_TABLE, &data);
302            Some(data)
303        } else {
304            None
305        };
306
307        let name = if flg & FLAG_NAME != 0 {
308            Some(self.read_string(r)?)
309        } else {
310            None
311        };
312
313        let comment = if flg & FLAG_COMMENT != 0 {
314            Some(self.read_string(r)?)
315        } else {
316            None
317        };
318
319        if flg & FLAG_HDR_CRC != 0 {
320            r.read_exact(&mut self.buf[..2])?;
321            let digest = LITTLE_ENDIAN.uint16(&self.buf[..2]);
322            if digest != self.digest as u16 {
323                return Err(get_err_invalid_header());
324            }
325        }
326        self.digest = 0;
327        Ok(Some(Header {
328            comment,
329            extra: extra_data,
330            mod_time,
331            name,
332            os,
333        }))
334    }
335
336    /// read_string reads a NUL-terminated string from self.r.
337    /// It treats the bytes read as being encoded as ISO 8859-1 (Latin-1) and
338    /// will output a string encoded using UTF-8.
339    /// This method always updates self.digest with the data read.
340    fn read_string<T: std::io::BufRead>(&mut self, r: &mut T) -> std::io::Result<String> {
341        let mut need_conv = false;
342        let mut i = 0;
343        loop {
344            if i >= self.buf.len() {
345                return Err(get_err_invalid_header());
346            }
347            r.read_exact(&mut self.buf[i..i + 1])?;
348            if self.buf[i] > 0x7f {
349                need_conv = true;
350            }
351            if self.buf[i] == 0 {
352                // Digest covers the NUL terminator.
353                self.digest = crc32::update(self.digest, &crc32::IEEE_TABLE, &self.buf[..i + 1]);
354
355                // Strings are ISO 8859-1, Latin-1 (RFC 1952, section 2.3.1).
356                if need_conv {
357                    let mut s = String::new();
358                    for ch in &self.buf[..i] {
359                        s.push(*ch as char);
360                    }
361                    return Ok(s);
362                }
363                return Ok(String::from_utf8_lossy(&self.buf[..i]).to_string());
364            }
365            i += 1;
366        }
367    }
368}