ggstd/compress/gzip/gunzip.rs
1// Copyright 2023 The rust-ggstd authors. All rights reserved.
2// Copyright 2009 The Go Authors. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6use crate::compress::flate;
7use crate::encoding::binary::{ByteOrder, LITTLE_ENDIAN};
8use crate::errors;
9use crate::hash::crc32;
10use crate::io as ggio;
11use crate::time;
12
// Expected values of the first three header bytes. read_header rejects a
// header whose leading bytes differ from these.
pub(super) const GZIP_ID1: u8 = 0x1f;
const GZIP_ID2: u8 = 0x8b;
const GZIP_DEFLATE: u8 = 8;
// Bit masks tested against the header flag byte (byte 3) in read_header.
const _FLAG_TEXT: u8 = 1 << 0; // not referenced in the code visible here
const FLAG_HDR_CRC: u8 = 1 << 1; // a 2-byte header CRC follows the optional fields
const FLAG_EXTRA: u8 = 1 << 2; // a length-prefixed "extra" field is present
const FLAG_NAME: u8 = 1 << 3; // a NUL-terminated file name is present
const FLAG_COMMENT: u8 = 1 << 4; // a NUL-terminated comment is present
21
/// ERR_CHECKSUM_MSG is the message of the std::io::Error that is returned
/// when the checksum is wrong.
pub const ERR_CHECKSUM_MSG: &str = "gzip: invalid checksum";
/// ERR_INVALID_HEADER is the message of the std::io::Error that is returned
/// when the header is wrong.
pub const ERR_INVALID_HEADER: &str = "gzip: invalid header";
28
29fn get_err_checksum() -> std::io::Error {
30 errors::new_stdio_other_error(ERR_CHECKSUM_MSG.to_string())
31}
32
33fn get_err_invalid_header() -> std::io::Error {
34 errors::new_stdio_other_error(ERR_INVALID_HEADER.to_string())
35}
36
/// The gzip file stores a header giving metadata about the compressed file.
/// The Reader exposes the parsed header through its `header` field.
///
/// Strings must be UTF-8 encoded and may only contain Unicode code points
/// U+0001 through U+00FF, due to limitations of the GZIP file format.
///
/// When a header is parsed by read_header, `extra`, `name` and `comment` are
/// `Some` only if the corresponding flag bit is set in the header's flag byte.
#[derive(Debug)]
pub struct Header {
    pub comment: Option<String>, // comment; None when the comment flag is not set
    pub extra: Option<Vec<u8>>, // "extra data"; None when the extra flag is not set
    pub mod_time: time::Time, // modification time; time::Time::default() when the stored value is zero
    pub name: Option<String>, // file name; None when the name flag is not set
    pub os: u8, // operating system type (header byte 9, stored as read)
}
50
/// Per-stream scratch state used while parsing a header and while checking
/// the decompressed data against the trailer.
struct ReadState {
    // Running CRC-32, IEEE polynomial (section 8). read_header uses it for the
    // header CRC and leaves it at zero; Reader::read then accumulates the
    // CRC of the decompressed bytes in it.
    digest: u32,
    // Scratch buffer (512 bytes, see ReadState::new) holding the fixed header
    // bytes and NUL-terminated strings while they are being read.
    buf: Vec<u8>,
}
55
/// A Reader can be used to retrieve
/// uncompressed data from a gzip-format compressed file.
///
/// In general, a gzip file can be a concatenation of gzip files,
/// each with its own header. Reads from the Reader
/// return the concatenation of the uncompressed data of each.
///
/// Gzip files store a length and checksum of the uncompressed data.
/// When read reaches the end of the uncompressed data and the length or
/// checksum does not match, the Reader returns an error whose message is
/// ERR_CHECKSUM_MSG. Clients should treat data returned by read as
/// tentative until they receive the end-of-file result marking the end
/// of the data.
pub struct Reader<'a, Input: std::io::BufRead> {
    pub header: Option<Header>, // valid after Reader::new or Reader.reset; None when no stream is available
    read_state: ReadState, // running CRC and scratch buffer
    decompressor: flate::Reader<&'a mut Input>, // inflates the member body; also owns the input reader
    size: u32, // Uncompressed size (section 2.3.1)
    err: Option<std::io::Error>, // recorded error; returned again by every later read
    multistream: bool, // whether read continues with the next member after a trailer
}
78
79impl<'a, Input: std::io::BufRead> Reader<'a, Input> {
80 /// new creates a new Reader reading the given reader.
81 ///
82 /// Make sure that the reader implements buffering otherwise the performance
83 /// can be low. You can use std::io::BufReader to add buffering to any reader.
84 ///
85 ///
86 // If r does not also implement io.ByteReader,
87 // the decompressor may read more data than necessary from r.
88 //
89 // It is the caller's responsibility to call close on the Reader when done.
90 //
91 /// The Reader.header fields will be valid in the Reader returned.
92 /// If Reader.header is None, then there is no stream available.
93 pub fn new(r: &'a mut Input) -> std::io::Result<Self> {
94 // z := new(Reader)
95 // if err := self.reset(r); err != nil {
96 // return nil, err
97 // }
98 // return z, nil
99
100 let mut read_state = ReadState::new();
101 let header = read_state.read_header(r)?;
102 let decompressor = flate::Reader::new(r);
103
104 Ok(Self {
105 header,
106 decompressor,
107 read_state,
108 size: 0,
109 err: None,
110 multistream: true,
111 })
112 }
113
114 /// reset discards the Reader self's state and makes it equivalent to the
115 /// result of its original state from Reader::new, but reading from r instead.
116 /// This permits reusing a Reader rather than allocating a new one.
117 pub fn reset(&mut self, r: &'a mut Input) -> std::io::Result<()> {
118 self.read_state = ReadState::new();
119 self.header = self.read_state.read_header(r)?;
120 self.decompressor.reset(r, &[]);
121 self.multistream = true;
122 self.size = 0;
123 self.err = None;
124 Ok(())
125 }
126
127 /// reset_state is similar to reset, but reuses the underlying reader.
128 pub fn reset_state(&mut self) -> std::io::Result<()> {
129 self.read_state = ReadState::new();
130 self.header = self
131 .read_state
132 .read_header(self.decompressor.input_reader())?;
133 self.decompressor.reset_state(&[]);
134 self.multistream = true;
135 self.size = 0;
136 self.err = None;
137 Ok(())
138 }
139
140 /// multistream controls whether the reader supports multistream files.
141 ///
142 /// If enabled (the default), the Reader expects the input to be a sequence
143 /// of individually gzipped data streams, each with its own header and
144 /// trailer, ending at EOF. The effect is that the concatenation of a sequence
145 /// of gzipped files is treated as equivalent to the gzip of the concatenation
146 /// of the sequence. This is standard behavior for gzip readers.
147 ///
148 /// Calling Multistream(false) disables this behavior; disabling the behavior
149 /// can be useful when reading file formats that distinguish individual gzip
150 /// data streams or mix gzip data streams with other data streams.
151 /// In this mode, when the Reader reaches the end of the data stream,
152 /// Read returns io.EOF. The underlying reader must implement io.ByteReader
153 /// in order to be left positioned just after the gzip stream.
154 /// To start the next stream, call self.reset(r) followed by self.Multistream(false).
155 /// If there is no next stream, self.reset(r) will return io.EOF.
156 pub fn multistream(&mut self, ok: bool) {
157 self.multistream = ok;
158 }
159
160 /// close closes the Reader. It does not close the underlying io.Reader.
161 /// In order for the GZIP checksum to be verified, the reader must be
162 /// fully consumed until the io.EOF.
163 pub fn close(&mut self) -> std::io::Result<()> {
164 self.decompressor.close()
165 }
166
167 /// is_eof returns true if there are no more gzip steams available.
168 pub fn is_eof(&self) -> bool {
169 self.header.is_none()
170 }
171}
172
173impl<Input: std::io::BufRead> crate::io::Reader for Reader<'_, Input> {
174 /// read implements io.Reader, reading uncompressed bytes from its underlying Reader.
175 fn read(&mut self, p: &mut [u8]) -> ggio::IoRes {
176 let mut n = 0;
177 if self.err.is_some() {
178 return (0, errors::copy_stdio_option_error(&self.err));
179 }
180
181 while n == 0 {
182 if self.is_eof() {
183 return ggio::EOF;
184 }
185 let res = crate::io::Reader::read(&mut self.decompressor, p);
186 self.read_state.digest =
187 crc32::update(self.read_state.digest, &crc32::IEEE_TABLE, &p[..res.0]);
188 self.size += res.0 as u32;
189 if !ggio::is_eof(&res) {
190 return res;
191 }
192 (n, self.err) = res;
193
194 // Finished file; check checksum and size.
195 {
196 let mut buf = [0; 8];
197 if let Err(err) = self.decompressor.input_reader().read_exact(&mut buf) {
198 self.err = Some(err);
199 return (n, errors::copy_stdio_option_error(&self.err));
200 }
201 let digest = LITTLE_ENDIAN.uint32(&buf[..4]);
202 let size = LITTLE_ENDIAN.uint32(&buf[4..8]);
203 if digest != self.read_state.digest || size != self.size {
204 self.err = Some(get_err_checksum());
205 return (n, errors::copy_stdio_option_error(&self.err));
206 }
207 }
208 self.read_state.digest = 0;
209 self.size = 0;
210
211 // File is ok; check if there is another.
212 if !self.multistream {
213 return (n, None);
214 }
215 self.err = None; // Remove io.EOF
216
217 self.read_state = ReadState::new();
218 self.header = match self
219 .read_state
220 .read_header(self.decompressor.input_reader())
221 {
222 Ok(header) => header,
223 Err(err) => {
224 return (n, Some(err));
225 }
226 };
227 self.decompressor.reset_state(&[]);
228 }
229
230 (n, None)
231 }
232}
233
234impl<Input: std::io::BufRead> std::io::Read for Reader<'_, Input> {
235 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
236 let res = crate::io::Reader::read(self, buf);
237 if res.0 > 0 {
238 Ok(res.0)
239 } else if ggio::is_eof(&res) {
240 return Ok(0);
241 } else {
242 return Err(res.1.unwrap());
243 }
244 }
245}
246
247impl ReadState {
248 fn new() -> Self {
249 Self {
250 digest: 0,
251 buf: vec![0; 512],
252 }
253 }
254
255 /// read_header reads the GZIP header according to section 2.3.1.
256 /// This method does not set self.err.
257 fn read_header<T: std::io::BufRead>(
258 &mut self,
259 r: &mut T,
260 ) -> Result<Option<Header>, std::io::Error> {
261 // if _, err = io.ReadFull(r, self.buf[..10]); err != nil {
262 // // RFC 1952, section 2.2, says the following:
263 // // A gzip file consists of a series of "members" (compressed data sets).
264 // //
265 // // Other than this, the specification does not clarify whether a
266 // // "series" is defined as "one or more" or "zero or more". To err on the
267 // // side of caution, Go interprets this to mean "zero or more".
268 // // Thus, it is okay to return io.EOF here.
269 // return hdr, err
270 // }
271 let res = ggio::read_full(r, &mut self.buf[..10]);
272 if res.0 == 0 && ggio::is_unexpected_eof(&res) {
273 // nothing was read, there is no header
274 return Ok(None);
275 }
276 if let Some(err) = res.1 {
277 return Err(err);
278 }
279 if self.buf[0] != GZIP_ID1 || self.buf[1] != GZIP_ID2 || self.buf[2] != GZIP_DEFLATE {
280 return Err(get_err_invalid_header());
281 }
282 let flg = self.buf[3];
283 let t = LITTLE_ENDIAN.uint32(&self.buf[4..8]) as i64;
284 let mod_time = if t > 0 {
285 // Section 2.3.1, the zero value for MTIME means that the
286 // modified time is not set.
287 time::unix(t, 0)
288 } else {
289 time::Time::default()
290 };
291 // self.buf[8] is XFL and is currently ignored.
292 let os = self.buf[9];
293 self.digest = crc32::checksum_ieee(&self.buf[..10]);
294
295 let extra_data: Option<Vec<u8>> = if flg & FLAG_EXTRA != 0 {
296 let mut buf = [0; 2];
297 r.read_exact(&mut buf)?;
298 self.digest = crc32::update(self.digest, &crc32::IEEE_TABLE, &buf[..2]);
299 let mut data = vec![0; LITTLE_ENDIAN.uint16(&buf[..2]) as usize];
300 r.read_exact(&mut data)?;
301 self.digest = crc32::update(self.digest, &crc32::IEEE_TABLE, &data);
302 Some(data)
303 } else {
304 None
305 };
306
307 let name = if flg & FLAG_NAME != 0 {
308 Some(self.read_string(r)?)
309 } else {
310 None
311 };
312
313 let comment = if flg & FLAG_COMMENT != 0 {
314 Some(self.read_string(r)?)
315 } else {
316 None
317 };
318
319 if flg & FLAG_HDR_CRC != 0 {
320 r.read_exact(&mut self.buf[..2])?;
321 let digest = LITTLE_ENDIAN.uint16(&self.buf[..2]);
322 if digest != self.digest as u16 {
323 return Err(get_err_invalid_header());
324 }
325 }
326 self.digest = 0;
327 Ok(Some(Header {
328 comment,
329 extra: extra_data,
330 mod_time,
331 name,
332 os,
333 }))
334 }
335
336 /// read_string reads a NUL-terminated string from self.r.
337 /// It treats the bytes read as being encoded as ISO 8859-1 (Latin-1) and
338 /// will output a string encoded using UTF-8.
339 /// This method always updates self.digest with the data read.
340 fn read_string<T: std::io::BufRead>(&mut self, r: &mut T) -> std::io::Result<String> {
341 let mut need_conv = false;
342 let mut i = 0;
343 loop {
344 if i >= self.buf.len() {
345 return Err(get_err_invalid_header());
346 }
347 r.read_exact(&mut self.buf[i..i + 1])?;
348 if self.buf[i] > 0x7f {
349 need_conv = true;
350 }
351 if self.buf[i] == 0 {
352 // Digest covers the NUL terminator.
353 self.digest = crc32::update(self.digest, &crc32::IEEE_TABLE, &self.buf[..i + 1]);
354
355 // Strings are ISO 8859-1, Latin-1 (RFC 1952, section 2.3.1).
356 if need_conv {
357 let mut s = String::new();
358 for ch in &self.buf[..i] {
359 s.push(*ch as char);
360 }
361 return Ok(s);
362 }
363 return Ok(String::from_utf8_lossy(&self.buf[..i]).to_string());
364 }
365 i += 1;
366 }
367 }
368}