// rc_zip/parse/archive.rs

1use chrono::{offset::Utc, DateTime, TimeZone};
2use ownable::{IntoOwned, ToOwned};
3use winnow::{binary::le_u16, PResult, Partial};
4
5use crate::{
6    encoding::Encoding,
7    parse::{Mode, Version},
8};
9
10use super::{zero_datetime, ExtraField, NtfsAttr};
11
12/// An Archive contains general information about a zip file, along with a list
13/// of [entries][Entry].
14///
15/// It is obtained through a state machine like
16/// [ArchiveFsm](crate::fsm::ArchiveFsm), although end-users tend to use
17/// higher-level interfaces like
18/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) or
19/// [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio).
20pub struct Archive {
21    pub(crate) size: u64,
22    pub(crate) encoding: Encoding,
23    pub(crate) entries: Vec<Entry>,
24    pub(crate) comment: String,
25}
26
27impl Archive {
28    /// The size of .zip file that was read, in bytes.
29    #[inline(always)]
30    pub fn size(&self) -> u64 {
31        self.size
32    }
33
34    /// Iterate over all files in this zip, read from the central directory.
35    pub fn entries(&self) -> impl Iterator<Item = &Entry> {
36        self.entries.iter()
37    }
38
39    /// Attempts to look up an entry by name. This is usually a bad idea,
40    /// as names aren't necessarily normalized in zip archives.
41    pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<&Entry> {
42        self.entries.iter().find(|&x| x.name == name.as_ref())
43    }
44
45    /// Returns the detected character encoding for text fields
46    /// (names, comments) inside this zip archive.
47    #[inline(always)]
48    pub fn encoding(&self) -> Encoding {
49        self.encoding
50    }
51
52    /// Returns the comment for this archive, if any. When reading
53    /// a zip file with an empty comment field, this will return None.
54    #[inline(always)]
55    pub fn comment(&self) -> &str {
56        &self.comment
57    }
58}
59
60/// Describes a zip archive entry (a file, a directory, a symlink)
61#[derive(Clone)]
62pub struct Entry {
63    /// Name of the file
64    ///
65    /// This should be a relative path, separated by `/`. However, there are zip
66    /// files in the wild with all sorts of evil variants, so, be conservative
67    /// in what you accept.
68    ///
69    /// See also [Self::sanitized_name], which returns a sanitized version of
70    /// the name, working around zip slip vulnerabilities.
71    pub name: String,
72
73    /// Compression method: Store, Deflate, Bzip2, etc.
74    pub method: Method,
75
76    /// Comment is any arbitrary user-defined string shorter than 64KiB
77    pub comment: String,
78
79    /// This entry's "last modified" timestamp - with caveats
80    ///
81    /// Due to the history of the ZIP file format, this may be inaccurate. It may be offset
82    /// by a few hours, if there is no extended timestamp information. It may have a resolution
83    /// as low as two seconds, if only MSDOS timestamps are present. It may default to the Unix
84    /// epoch, if something went really wrong.
85    ///
86    /// If you're reading this after the year 2038, or after the year 2108, godspeed.
87    pub modified: DateTime<Utc>,
88
89    /// This entry's "created" timestamp, if available.
90    ///
91    /// See [Self::modified] for caveats.
92    pub created: Option<DateTime<Utc>>,
93
94    /// This entry's "last accessed" timestamp, if available.
95    ///
96    /// See [Self::accessed] for caveats.
97    pub accessed: Option<DateTime<Utc>>,
98
99    /// Offset of the local file header in the zip file
100    ///
101    /// ```text
102    /// [optional non-zip data]
103    /// [local file header 1] <------ header_offset points here
104    /// [encryption header 1]
105    /// [file data 1]
106    /// [data descriptor 1]
107    /// ...
108    /// [central directory]
109    /// [optional zip64 end of central directory info]
110    /// [end of central directory record]
111    /// ```
112    pub header_offset: u64,
113
114    /// Version of zip needed to extract this archive.
115    pub reader_version: Version,
116
117    /// General purpose bit flag
118    ///
119    /// In the zip format, the most noteworthy flag (bit 11) is for UTF-8 names.
120    /// Other flags can indicate: encryption (unsupported), various compression
121    /// settings (depending on the [Method] used).
122    ///
123    /// For LZMA, general-purpose bit 1 denotes the EOS marker.
124    pub flags: u16,
125
126    /// Unix user ID
127    ///
128    /// Only present if a Unix extra field or New Unix extra field was found.
129    pub uid: Option<u32>,
130
131    /// Unix group ID
132    ///
133    /// Only present if a Unix extra field or New Unix extra field was found.
134    pub gid: Option<u32>,
135
136    /// CRC-32 hash as found in the central directory.
137    ///
138    /// Note that this may be zero, and the actual CRC32 might be in the local header, or (more
139    /// commonly) in the data descriptor instead.
140    pub crc32: u32,
141
142    /// Size in bytes, after compression
143    pub compressed_size: u64,
144
145    /// Size in bytes, before compression
146    ///
147    /// This will be zero for directories.
148    pub uncompressed_size: u64,
149
150    /// File mode.
151    pub mode: Mode,
152}
153
154impl Entry {
155    /// Returns a sanitized version of the entry's name, if it
156    /// seems safe. In particular, if this method feels like the
157    /// entry name is trying to do a zip slip (cf.
158    /// <https://snyk.io/research/zip-slip-vulnerability>), it'll return
159    /// None.
160    ///
161    /// Other than that, it will strip any leading slashes on non-Windows OSes.
162    pub fn sanitized_name(&self) -> Option<&str> {
163        let name = self.name.as_str();
164
165        // refuse entries with traversed/absolute path to mitigate zip slip
166        if name.contains("..") {
167            return None;
168        }
169
170        #[cfg(windows)]
171        {
172            if name.contains(":\\") || name.starts_with("\\") {
173                return None;
174            }
175            Some(name)
176        }
177
178        #[cfg(not(windows))]
179        {
180            // strip absolute prefix on entries pointing to root path
181            let mut entry_chars = name.chars();
182            let mut name = name;
183            while name.starts_with('/') {
184                entry_chars.next();
185                name = entry_chars.as_str()
186            }
187            Some(name)
188        }
189    }
190
191    /// Apply the extra field to the entry, updating its metadata.
192    pub(crate) fn set_extra_field(&mut self, ef: &ExtraField) {
193        match &ef {
194            ExtraField::Zip64(z64) => {
195                self.uncompressed_size = z64.uncompressed_size;
196                self.compressed_size = z64.compressed_size;
197                self.header_offset = z64.header_offset;
198            }
199            ExtraField::Timestamp(ts) => {
200                self.modified = Utc
201                    .timestamp_opt(ts.mtime as i64, 0)
202                    .single()
203                    .unwrap_or_else(zero_datetime);
204            }
205            ExtraField::Ntfs(nf) => {
206                for attr in &nf.attrs {
207                    // note: other attributes are unsupported
208                    if let NtfsAttr::Attr1(attr) = attr {
209                        self.modified = attr.mtime.to_datetime().unwrap_or_else(zero_datetime);
210                        self.created = attr.ctime.to_datetime();
211                        self.accessed = attr.atime.to_datetime();
212                    }
213                }
214            }
215            ExtraField::Unix(uf) => {
216                self.modified = Utc
217                    .timestamp_opt(uf.mtime as i64, 0)
218                    .single()
219                    .unwrap_or_else(zero_datetime);
220
221                if self.uid.is_none() {
222                    self.uid = Some(uf.uid as u32);
223                }
224
225                if self.gid.is_none() {
226                    self.gid = Some(uf.gid as u32);
227                }
228            }
229            ExtraField::NewUnix(uf) => {
230                self.uid = Some(uf.uid as u32);
231                self.gid = Some(uf.uid as u32);
232            }
233            _ => {}
234        };
235    }
236}
237
/// What an entry represents on disk: a directory, a file, or a symbolic link.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EntryKind {
    /// A directory entry
    Directory,

    /// A file entry
    File,

    /// A symbolic link entry
    Symlink,
}

impl EntryKind {
    /// Returns `true` when this is [`EntryKind::Directory`]
    ///
    /// ```
    /// # use rc_zip::EntryKind;
    /// assert!(EntryKind::Directory.is_dir());
    /// ```
    pub fn is_dir(self) -> bool {
        matches!(self, Self::Directory)
    }

    /// Returns `true` when this is [`EntryKind::File`]
    ///
    /// ```
    /// # struct Archive;
    /// # impl Archive {
    /// #     fn entries(self) -> std::vec::IntoIter<rc_zip::Entry> {
    /// #         Vec::new().into_iter()
    /// #     }
    /// # }
    /// # let archive = Archive;
    /// for entry in archive.entries().filter(|e| e.kind().is_file()) {
    ///     // ...
    /// }
    /// ```
    pub fn is_file(self) -> bool {
        matches!(self, Self::File)
    }

    /// Returns `true` when this is [`EntryKind::Symlink`]
    ///
    /// ```
    /// # struct Archive;
    /// # impl Archive {
    /// #     fn entries(self) -> Vec<rc_zip::Entry> {
    /// #         Vec::new()
    /// #     }
    /// # }
    /// # let archive = Archive;
    /// for entry in archive.entries() {
    ///     if entry.kind().is_symlink() {
    ///         continue;
    ///     }
    ///
    ///     // ...
    /// }
    /// ```
    pub fn is_symlink(self) -> bool {
        matches!(self, Self::Symlink)
    }
}
302
303impl Entry {
304    /// Determine the kind of this entry based on its mode.
305    pub fn kind(&self) -> EntryKind {
306        if self.mode.has(Mode::SYMLINK) {
307            EntryKind::Symlink
308        } else if self.mode.has(Mode::DIR) {
309            EntryKind::Directory
310        } else {
311            EntryKind::File
312        }
313    }
314}
315
316/// Compression method used for a file entry.
317///
318/// In archives that follow [ISO/IEC 21320-1:2015](https://www.iso.org/standard/60101.html), only
319/// [Store][Method::Store] and [Deflate][Method::Deflate] should be used.
320///
321/// However, in the wild, it is not too uncommon to encounter [Bzip2][Method::Bzip2],
322/// [Lzma][Method::Lzma] or others.
323#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IntoOwned, ToOwned)]
324#[repr(u16)]
325pub enum Method {
326    /// No compression is applied
327    Store = Self::STORE,
328
329    /// [DEFLATE (RFC 1951)](https://www.ietf.org/rfc/rfc1951.txt)
330    Deflate = Self::DEFLATE,
331
332    /// [DEFLATE64](https://deflate64.com/)
333    Deflate64 = Self::DEFLATE64,
334
335    /// [BZIP-2](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf)
336    Bzip2 = Self::BZIP2,
337
338    /// [LZMA](https://github.com/jljusten/LZMA-SDK/blob/master/DOC/lzma-specification.txt)
339    Lzma = Self::LZMA,
340
341    /// [zstd](https://datatracker.ietf.org/doc/html/rfc8878)
342    Zstd = Self::ZSTD,
343
344    /// [MP3](https://www.iso.org/obp/ui/#iso:std:iso-iec:11172:-3:ed-1:v1:en)
345    Mp3 = Self::MP3,
346
347    /// [XZ](https://tukaani.org/xz/xz-file-format.txt)
348    Xz = Self::XZ,
349
350    /// [JPEG](https://jpeg.org/jpeg/)
351    Jpeg = Self::JPEG,
352
353    /// [WavPack](https://www.wavpack.com/)
354    WavPack = Self::WAV_PACK,
355
356    /// [PPMd](https://en.wikipedia.org/wiki/Prediction_by_partial_matching)
357    Ppmd = Self::PPMD,
358
359    /// AE-x encryption marker (see Appendix E of appnote)
360    Aex = Self::AEX,
361
362    /// A compression method that isn't recognized by this crate.
363    Unrecognized(u16),
364}
365
366impl Method {
367    const STORE: u16 = 0;
368    const DEFLATE: u16 = 8;
369    const DEFLATE64: u16 = 9;
370    const BZIP2: u16 = 12;
371    const LZMA: u16 = 14;
372    const ZSTD: u16 = 93;
373    const MP3: u16 = 94;
374    const XZ: u16 = 95;
375    const JPEG: u16 = 96;
376    const WAV_PACK: u16 = 97;
377    const PPMD: u16 = 98;
378    const AEX: u16 = 99;
379
380    /// Parse a method from a byte slice
381    pub fn parser(i: &mut Partial<&[u8]>) -> PResult<Self> {
382        le_u16(i).map(From::from)
383    }
384}
385
386impl From<u16> for Method {
387    fn from(u: u16) -> Self {
388        match u {
389            Self::STORE => Self::Store,
390            Self::DEFLATE => Self::Deflate,
391            Self::DEFLATE64 => Self::Deflate64,
392            Self::BZIP2 => Self::Bzip2,
393            Self::LZMA => Self::Lzma,
394            Self::ZSTD => Self::Zstd,
395            Self::MP3 => Self::Mp3,
396            Self::XZ => Self::Xz,
397            Self::JPEG => Self::Jpeg,
398            Self::WAV_PACK => Self::WavPack,
399            Self::PPMD => Self::Ppmd,
400            Self::AEX => Self::Aex,
401            u => Self::Unrecognized(u),
402        }
403    }
404}
405
406impl From<Method> for u16 {
407    fn from(method: Method) -> Self {
408        match method {
409            Method::Store => Method::STORE,
410            Method::Deflate => Method::DEFLATE,
411            Method::Deflate64 => Method::DEFLATE64,
412            Method::Bzip2 => Method::BZIP2,
413            Method::Lzma => Method::LZMA,
414            Method::Zstd => Method::ZSTD,
415            Method::Mp3 => Method::MP3,
416            Method::Xz => Method::XZ,
417            Method::Jpeg => Method::JPEG,
418            Method::WavPack => Method::WAV_PACK,
419            Method::Ppmd => Method::PPMD,
420            Method::Aex => Method::AEX,
421            Method::Unrecognized(u) => u,
422        }
423    }
424}