rc_zip/parse/
archive.rs

1use chrono::{offset::Utc, DateTime, TimeZone};
2use ownable::{IntoOwned, ToOwned};
3use winnow::{binary::le_u16, PResult, Partial};
4
5use crate::{
6    encoding::Encoding,
7    parse::{Mode, Version},
8};
9
10use super::{zero_datetime, ExtraField, NtfsAttr};
11
12/// An Archive contains general information about a zip file, along with a list
13/// of [entries][Entry].
14///
15/// It is obtained through a state machine like
16/// [ArchiveFsm](crate::fsm::ArchiveFsm), although end-users tend to use
17/// higher-level interfaces like
18/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) or
19/// [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio).
20pub struct Archive {
21    pub(crate) size: u64,
22    pub(crate) encoding: Encoding,
23    pub(crate) entries: Vec<Entry>,
24    pub(crate) comment: String,
25}
26
27impl Archive {
28    /// The size of .zip file that was read, in bytes.
29    #[inline(always)]
30    pub fn size(&self) -> u64 {
31        self.size
32    }
33
34    /// Iterate over all files in this zip, read from the central directory.
35    pub fn entries(&self) -> impl Iterator<Item = &Entry> {
36        self.entries.iter()
37    }
38
39    /// Attempts to look up an entry by name. This is usually a bad idea,
40    /// as names aren't necessarily normalized in zip archives.
41    pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<&Entry> {
42        self.entries.iter().find(|&x| x.name == name.as_ref())
43    }
44
45    /// Returns the detected character encoding for text fields
46    /// (names, comments) inside this zip archive.
47    #[inline(always)]
48    pub fn encoding(&self) -> Encoding {
49        self.encoding
50    }
51
52    /// Returns the comment for this archive, if any. When reading
53    /// a zip file with an empty comment field, this will return None.
54    #[inline(always)]
55    pub fn comment(&self) -> &str {
56        &self.comment
57    }
58}
59
60/// Describes a zip archive entry (a file, a directory, a symlink)
61#[derive(Clone)]
62pub struct Entry {
63    /// Name of the file
64    ///
65    /// This should be a relative path, separated by `/`. However, there are zip
66    /// files in the wild with all sorts of evil variants, so, be conservative
67    /// in what you accept.
68    ///
69    /// See also [Self::sanitized_name], which returns a sanitized version of
70    /// the name, working around zip slip vulnerabilities.
71    pub name: String,
72
73    /// Compression method: Store, Deflate, Bzip2, etc.
74    pub method: Method,
75
76    /// Comment is any arbitrary user-defined string shorter than 64KiB
77    pub comment: String,
78
79    /// This entry's "last modified" timestamp - with caveats
80    ///
81    /// Due to the history of the ZIP file format, this may be inaccurate. It may be offset
82    /// by a few hours, if there is no extended timestamp information. It may have a resolution
83    /// as low as two seconds, if only MSDOS timestamps are present. It may default to the Unix
84    /// epoch, if something went really wrong.
85    ///
86    /// If you're reading this after the year 2038, or after the year 2108, godspeed.
87    pub modified: DateTime<Utc>,
88
89    /// This entry's "created" timestamp, if available.
90    ///
91    /// See [Self::modified] for caveats.
92    pub created: Option<DateTime<Utc>>,
93
94    /// This entry's "last accessed" timestamp, if available.
95    ///
96    /// See [Self::accessed] for caveats.
97    pub accessed: Option<DateTime<Utc>>,
98
99    /// Offset of the local file header in the zip file
100    ///
101    /// ```text
102    /// [optional non-zip data]
103    /// [local file header 1] <------ header_offset points here
104    /// [encryption header 1]
105    /// [file data 1]
106    /// [data descriptor 1]
107    /// ...
108    /// [central directory]
109    /// [optional zip64 end of central directory info]
110    /// [end of central directory record]
111    /// ```
112    pub header_offset: u64,
113
114    /// Version of zip needed to extract this archive.
115    pub reader_version: Version,
116
117    /// General purpose bit flag
118    ///
119    /// In the zip format, the most noteworthy flag (bit 11) is for UTF-8 names.
120    /// Other flags can indicate: encryption (unsupported), various compression
121    /// settings (depending on the [Method] used).
122    ///
123    /// For LZMA, general-purpose bit 1 denotes the EOS marker.
124    pub flags: u16,
125
126    /// Unix user ID
127    ///
128    /// Only present if a Unix extra field or New Unix extra field was found.
129    pub uid: Option<u32>,
130
131    /// Unix group ID
132    ///
133    /// Only present if a Unix extra field or New Unix extra field was found.
134    pub gid: Option<u32>,
135
136    /// CRC-32 hash as found in the central directory.
137    ///
138    /// Note that this may be zero, and the actual CRC32 might be in the local header, or (more
139    /// commonly) in the data descriptor instead.
140    pub crc32: u32,
141
142    /// Size in bytes, after compression
143    pub compressed_size: u64,
144
145    /// Size in bytes, before compression
146    ///
147    /// This will be zero for directories.
148    pub uncompressed_size: u64,
149
150    /// File mode.
151    pub mode: Mode,
152}
153
154impl Entry {
155    /// Returns a sanitized version of the entry's name, if it
156    /// seems safe. In particular, if this method feels like the
157    /// entry name is trying to do a zip slip (cf.
158    /// <https://snyk.io/research/zip-slip-vulnerability>), it'll return
159    /// None.
160    ///
161    /// Other than that, it will strip any leading slashes on non-Windows OSes.
162    pub fn sanitized_name(&self) -> Option<&str> {
163        let name = self.name.as_str();
164
165        // refuse entries with traversed/absolute path to mitigate zip slip
166        if name.contains("..") {
167            return None;
168        }
169
170        #[cfg(windows)]
171        {
172            if name.contains(":\\") || name.starts_with("\\") {
173                return None;
174            }
175            Some(name)
176        }
177
178        #[cfg(not(windows))]
179        {
180            // strip absolute prefix on entries pointing to root path
181            let mut entry_chars = name.chars();
182            let mut name = name;
183            while name.starts_with('/') {
184                entry_chars.next();
185                name = entry_chars.as_str()
186            }
187            Some(name)
188        }
189    }
190
191    /// Apply the extra field to the entry, updating its metadata.
192    pub(crate) fn set_extra_field(&mut self, ef: &ExtraField) {
193        match &ef {
194            ExtraField::Zip64(z64) => {
195                self.uncompressed_size = z64.uncompressed_size;
196                self.compressed_size = z64.compressed_size;
197                self.header_offset = z64.header_offset;
198            }
199            ExtraField::Timestamp(ts) => {
200                self.modified = Utc
201                    .timestamp_opt(ts.mtime as i64, 0)
202                    .single()
203                    .unwrap_or_else(zero_datetime);
204            }
205            ExtraField::Ntfs(nf) => {
206                for attr in &nf.attrs {
207                    // note: other attributes are unsupported
208                    if let NtfsAttr::Attr1(attr) = attr {
209                        self.modified = attr.mtime.to_datetime().unwrap_or_else(zero_datetime);
210                        self.created = attr.ctime.to_datetime();
211                        self.accessed = attr.atime.to_datetime();
212                    }
213                }
214            }
215            ExtraField::Unix(uf) => {
216                self.modified = Utc
217                    .timestamp_opt(uf.mtime as i64, 0)
218                    .single()
219                    .unwrap_or_else(zero_datetime);
220
221                if self.uid.is_none() {
222                    self.uid = Some(uf.uid as u32);
223                }
224
225                if self.gid.is_none() {
226                    self.gid = Some(uf.gid as u32);
227                }
228            }
229            ExtraField::NewUnix(uf) => {
230                self.uid = Some(uf.uid as u32);
231                self.gid = Some(uf.uid as u32);
232            }
233            _ => {}
234        };
235    }
236}
237
/// What an entry represents: a directory, a file, or a symbolic link.
#[derive(PartialEq, Eq, Debug)]
pub enum EntryKind {
    /// A directory entry
    Directory,

    /// A file entry
    File,

    /// A symbolic link entry
    Symlink,
}
250
251impl Entry {
252    /// Determine the kind of this entry based on its mode.
253    pub fn kind(&self) -> EntryKind {
254        if self.mode.has(Mode::SYMLINK) {
255            EntryKind::Symlink
256        } else if self.mode.has(Mode::DIR) {
257            EntryKind::Directory
258        } else {
259            EntryKind::File
260        }
261    }
262}
263
264/// Compression method used for a file entry.
265///
266/// In archives that follow [ISO/IEC 21320-1:2015](https://www.iso.org/standard/60101.html), only
267/// [Store][Method::Store] and [Deflate][Method::Deflate] should be used.
268///
269/// However, in the wild, it is not too uncommon to encounter [Bzip2][Method::Bzip2],
270/// [Lzma][Method::Lzma] or others.
271#[derive(
272    Debug, Clone, Copy, PartialEq, Eq, Hash, IntoOwned, ToOwned,
273)]
274#[repr(u16)]
275pub enum Method {
276    /// No compression is applied
277    Store = Self::STORE,
278
279    /// [DEFLATE (RFC 1951)](https://www.ietf.org/rfc/rfc1951.txt)
280    Deflate = Self::DEFLATE,
281
282    /// [DEFLATE64](https://deflate64.com/)
283    Deflate64 = Self::DEFLATE64,
284
285    /// [BZIP-2](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf)
286    Bzip2 = Self::BZIP2,
287
288    /// [LZMA](https://github.com/jljusten/LZMA-SDK/blob/master/DOC/lzma-specification.txt)
289    Lzma = Self::LZMA,
290
291    /// [zstd](https://datatracker.ietf.org/doc/html/rfc8878)
292    Zstd = Self::ZSTD,
293
294    /// [MP3](https://www.iso.org/obp/ui/#iso:std:iso-iec:11172:-3:ed-1:v1:en)
295    Mp3 = Self::MP3,
296
297    /// [XZ](https://tukaani.org/xz/xz-file-format.txt)
298    Xz = Self::XZ,
299
300    /// [JPEG](https://jpeg.org/jpeg/)
301    Jpeg = Self::JPEG,
302
303    /// [WavPack](https://www.wavpack.com/)
304    WavPack = Self::WAV_PACK,
305
306    /// [PPMd](https://en.wikipedia.org/wiki/Prediction_by_partial_matching)
307    Ppmd = Self::PPMD,
308
309    /// AE-x encryption marker (see Appendix E of appnote)
310    Aex = Self::AEX,
311
312    /// A compression method that isn't recognized by this crate.
313    Unrecognized(u16),
314}
315
316impl Method {
317    const STORE: u16 = 0;
318    const DEFLATE: u16 = 8;
319    const DEFLATE64: u16 = 9;
320    const BZIP2: u16 = 12;
321    const LZMA: u16 = 14;
322    const ZSTD: u16 = 93;
323    const MP3: u16 = 94;
324    const XZ: u16 = 95;
325    const JPEG: u16 = 96;
326    const WAV_PACK: u16 = 97;
327    const PPMD: u16 = 98;
328    const AEX: u16 = 99;
329
330    /// Parse a method from a byte slice
331    pub fn parser(i: &mut Partial<&[u8]>) -> PResult<Self> {
332        le_u16(i).map(From::from)
333    }
334}
335
336impl From<u16> for Method {
337    fn from(u: u16) -> Self {
338        match u {
339            Self::STORE => Self::Store,
340            Self::DEFLATE => Self::Deflate,
341            Self::DEFLATE64 => Self::Deflate64,
342            Self::BZIP2 => Self::Bzip2,
343            Self::LZMA => Self::Lzma,
344            Self::ZSTD => Self::Zstd,
345            Self::MP3 => Self::Mp3,
346            Self::XZ => Self::Xz,
347            Self::JPEG => Self::Jpeg,
348            Self::WAV_PACK => Self::WavPack,
349            Self::PPMD => Self::Ppmd,
350            Self::AEX => Self::Aex,
351            u => Self::Unrecognized(u),
352        }
353    }
354}
355
356impl From<Method> for u16 {
357    fn from(method: Method) -> Self {
358        match method {
359            Method::Store => Method::STORE,
360            Method::Deflate => Method::DEFLATE,
361            Method::Deflate64 => Method::DEFLATE64,
362            Method::Bzip2 => Method::BZIP2,
363            Method::Lzma => Method::LZMA,
364            Method::Zstd => Method::ZSTD,
365            Method::Mp3 => Method::MP3,
366            Method::Xz => Method::XZ,
367            Method::Jpeg => Method::JPEG,
368            Method::WavPack => Method::WAV_PACK,
369            Method::Ppmd => Method::PPMD,
370            Method::Aex => Method::AEX,
371            Method::Unrecognized(u) => u,
372        }
373    }
374}