1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
use chrono::{DateTime, Utc};
use num_enum::{FromPrimitive, IntoPrimitive};

use crate::{
    encoding::Encoding,
    parse::{ExtraField, Mode, Version},
};

/// An Archive contains general information about a zip file, along with a list
/// of [entries][StoredEntry].
///
/// It is obtained through a state machine like
/// [ArchiveFsm](crate::fsm::ArchiveFsm), although end-users tend to use
/// higher-level interfaces like
/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) or
/// [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio).
///
/// All fields are crate-private; outside of this crate they are read through
/// the accessor methods of this type.
pub struct Archive {
    /// Size of the .zip file that was read, in bytes.
    pub(crate) size: u64,
    /// Character encoding detected for text fields (names, comments).
    pub(crate) encoding: Encoding,
    /// Entries of this archive, as read from the central directory.
    pub(crate) entries: Vec<StoredEntry>,
    /// Archive-level comment; `None` when the comment field was empty.
    pub(crate) comment: Option<String>,
}

impl Archive {
    /// Size, in bytes, of the .zip file this archive was read from.
    pub fn size(&self) -> u64 {
        self.size
    }

    /// Returns an iterator over every entry of this zip, in the order they
    /// were read from the central directory.
    pub fn entries(&self) -> impl Iterator<Item = &StoredEntry> {
        self.entries.iter()
    }

    /// Looks for the first entry whose name is exactly `name`.
    ///
    /// Relying on this is usually a bad idea: names aren't necessarily
    /// normalized in zip archives, so the comparison is a plain string
    /// equality against whatever the archive recorded.
    pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<&StoredEntry> {
        // resolve the needle once, rather than on every comparison
        let wanted: &str = name.as_ref();
        self.entries.iter().find(|entry| entry.name() == wanted)
    }

    /// The character encoding that was detected for the text fields
    /// (names, comments) of this zip archive.
    pub fn encoding(&self) -> Encoding {
        self.encoding
    }

    /// The archive-level comment, if there is one. A zip file whose
    /// comment field is empty yields `None` here.
    pub fn comment(&self) -> Option<&String> {
        self.comment.as_ref()
    }
}

/// Describes a zip archive entry (a file, a directory, a symlink)
///
/// `Entry` contains normalized metadata fields, that can be set when
/// writing a zip archive. Additional metadata, along with the information
/// required to extract an entry, are available in [StoredEntry][] instead.
#[derive(Clone)]
pub struct Entry {
    /// Name of the file
    ///
    /// Must be a relative path, not start with a drive letter (e.g. C:),
    /// and must use forward slashes instead of back slashes.
    pub name: String,

    /// Compression method
    ///
    /// See [Method][] for more details.
    pub method: Method,

    /// Comment is any arbitrary user-defined string shorter than 64KiB
    ///
    /// `None` when the entry has no comment.
    pub comment: Option<String>,

    /// Modified timestamp, in UTC
    pub modified: chrono::DateTime<chrono::offset::Utc>,

    /// Created timestamp, in UTC, if available
    pub created: Option<chrono::DateTime<chrono::offset::Utc>>,

    /// Accessed timestamp, in UTC, if available
    pub accessed: Option<chrono::DateTime<chrono::offset::Utc>>,
}

/// An entry as stored into an Archive. Contains additional metadata and offset information.
///
/// Whereas [Entry][] is archive-independent, [StoredEntry][] contains information that is tied to
/// a specific archive.
///
/// When reading archives, one deals with a list of [StoredEntry][], whereas when writing one, one
/// typically only specifies an [Entry][] and provides the entry's contents: fields like the CRC32
/// hash, uncompressed size, and compressed size are derived automatically from the input.
#[derive(Clone)]
pub struct StoredEntry {
    /// Archive-independent information
    ///
    /// This contains the entry's name, timestamps, comment, compression method.
    pub entry: Entry,

    /// Offset of the local file header in the zip file
    ///
    /// ```text
    /// [optional non-zip data]
    /// [local file header 1] <------ header_offset points here
    /// [encryption header 1]
    /// [file data 1]
    /// [data descriptor 1]
    /// ...
    /// [central directory]
    /// [optional zip64 end of central directory info]
    /// [end of central directory record]
    /// ```
    pub header_offset: u64,

    /// External attributes (zip)
    pub external_attrs: u32,

    /// Version of zip supported by the tool that created this archive.
    pub creator_version: Version,

    /// Version of zip needed to extract this archive.
    pub reader_version: Version,

    /// General purpose bit flag
    ///
    /// In the zip format, the most noteworthy flag (bit 11) is for UTF-8 names.
    /// Other flags can indicate: encryption (unsupported), various compression
    /// settings (depending on the [Method] used).
    ///
    /// For LZMA, general-purpose bit 1 denotes the EOS marker.
    pub flags: u16,

    /// Unix user ID
    ///
    /// Only present if a Unix extra field or New Unix extra field was found.
    pub uid: Option<u32>,

    /// Unix group ID
    ///
    /// Only present if a Unix extra field or New Unix extra field was found.
    pub gid: Option<u32>,

    /// File mode
    ///
    /// This is what [StoredEntry::contents()] inspects to tell symbolic
    /// links, directories and regular files apart.
    pub mode: Mode,

    /// Any extra fields recognized while parsing the file.
    ///
    /// Most of these should be normalized and accessible as other fields,
    /// but they are also made available here raw.
    pub extra_fields: Vec<ExtraField>,

    /// Sizes, CRC-32 and zip64 flag for this entry.
    ///
    /// These fields are cheap to clone and needed for entry readers,
    /// hence them being in a separate struct.
    pub inner: StoredEntryInner,
}

/// Fields required to read an entry properly, typically cloned into owned entry
/// readers.
///
/// The struct only holds plain integers and a flag and derives `Copy`, so
/// handing a copy to a reader does not allocate.
#[derive(Clone, Copy, Debug)]
pub struct StoredEntryInner {
    /// CRC-32 hash as found in the central directory.
    ///
    /// Note that this may be zero, and the actual CRC32 might be in the local header, or (more
    /// commonly) in the data descriptor instead.
    pub crc32: u32,

    /// Size in bytes, after compression
    pub compressed_size: u64,

    /// Size in bytes, before compression
    ///
    /// This will be zero for directories.
    pub uncompressed_size: u64,

    /// True if this entry was read from a zip64 archive
    pub is_zip64: bool,
}

impl StoredEntry {
    /// The entry's name, returned unmodified. For a vetted variant, see
    /// [sanitized_name()](StoredEntry::sanitized_name).
    ///
    /// A well-behaved archive stores a relative path separated by `/` here.
    /// However, there are zip files in the wild with all sorts of evil
    /// variants, so, be conservative in what you accept.
    pub fn name(&self) -> &str {
        self.entry.name.as_str()
    }

    /// A sanitized version of the entry's name, or `None` if the name does
    /// not seem safe.
    ///
    /// Any name that looks like a zip slip attempt (cf.
    /// <https://snyk.io/research/zip-slip-vulnerability>) is refused:
    ///
    /// * on every platform, a name containing `..` anywhere;
    /// * on Windows, a name containing `:\` or starting with `\`.
    ///
    /// On non-Windows OSes, accepted names additionally have all of their
    /// leading slashes stripped. On Windows, accepted names are returned
    /// as-is.
    pub fn sanitized_name(&self) -> Option<&str> {
        let raw = self.name();

        // any `..` is treated as a traversal attempt, wherever it appears
        if raw.contains("..") {
            return None;
        }

        #[cfg(windows)]
        {
            // drive-qualified (`C:\...`) or root-relative (`\...`) paths are absolute
            let is_absolute = raw.contains(":\\") || raw.starts_with('\\');
            if is_absolute {
                None
            } else {
                Some(raw)
            }
        }

        #[cfg(not(windows))]
        {
            // entries pointing at the root path are made relative
            Some(raw.trim_start_matches('/'))
        }
    }

    /// This entry's comment, if it has one.
    ///
    /// When reading a zip file, an empty comment results in None.
    pub fn comment(&self) -> Option<&str> {
        self.entry.comment.as_deref()
    }

    /// The compression method this entry was stored with
    #[inline(always)]
    pub fn method(&self) -> Method {
        self.entry.method
    }

    /// This entry's "last modified" timestamp - with caveats
    ///
    /// The history of the ZIP file format makes this value potentially
    /// inaccurate:
    ///
    /// * without extended timestamp information, it may be offset by a few hours;
    /// * with only MSDOS timestamps, its resolution may be as low as two seconds;
    /// * if something went really wrong, it may default to the Unix epoch.
    ///
    /// If you're reading this after the year 2038, or after the year 2108, godspeed.
    #[inline(always)]
    pub fn modified(&self) -> DateTime<Utc> {
        self.entry.modified
    }

    /// This entry's "created" timestamp, when the archive provides one.
    ///
    /// The caveats listed on [StoredEntry::modified()] apply here too.
    #[inline(always)]
    pub fn created(&self) -> Option<&DateTime<Utc>> {
        self.entry.created.as_ref()
    }

    /// This entry's "last accessed" timestamp, when the archive provides one.
    ///
    /// The caveats listed on [StoredEntry::modified()] apply here too.
    #[inline(always)]
    pub fn accessed(&self) -> Option<&DateTime<Utc>> {
        self.entry.accessed.as_ref()
    }
}

/// The contents of an entry: a directory, a file, or a symbolic link.
///
/// Values of this type are produced by [StoredEntry::contents()], which
/// derives them from the entry's mode.
#[derive(Debug)]
pub enum EntryContents {
    /// The entry is a directory
    Directory,

    /// The entry is a file (anything that is neither a symbolic link nor a
    /// directory)
    File,

    /// The entry is a symbolic link
    Symlink,
}

impl StoredEntry {
    /// Classifies this entry as an [EntryContents] by looking at its mode.
    ///
    /// The symlink bit is checked first, then the directory bit; an entry
    /// with neither is reported as a regular file.
    pub fn contents(&self) -> EntryContents {
        if self.mode.has(Mode::SYMLINK) {
            return EntryContents::Symlink;
        }
        if self.mode.has(Mode::DIR) {
            return EntryContents::Directory;
        }
        EntryContents::File
    }
}

/// Compression method used for a file entry.
///
/// In archives that follow [ISO/IEC 21320-1:2015](https://www.iso.org/standard/60101.html), only
/// [Store][Method::Store] and [Deflate][Method::Deflate] should be used.
///
/// However, in the wild, it is not too uncommon to encounter [Bzip2][Method::Bzip2],
/// [Lzma][Method::Lzma] or others.
///
/// The enum is `#[repr(u16)]`, and the explicit discriminant of each variant is the numeric
/// value this crate associates with that method. Conversions to and from `u16` are generated by
/// the `IntoPrimitive` and `FromPrimitive` derives; a value without a dedicated variant is
/// carried by [Unrecognized][Method::Unrecognized], the `catch_all` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IntoPrimitive, FromPrimitive)]
#[repr(u16)]
pub enum Method {
    /// No compression is applied
    Store = 0,

    /// [DEFLATE (RFC 1951)](https://www.ietf.org/rfc/rfc1951.txt)
    Deflate = 8,

    /// [DEFLATE64](https://deflate64.com/)
    Deflate64 = 9,

    /// [BZIP-2](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf)
    Bzip2 = 12,

    /// [LZMA](https://github.com/jljusten/LZMA-SDK/blob/master/DOC/lzma-specification.txt)
    Lzma = 14,

    /// [zstd](https://datatracker.ietf.org/doc/html/rfc8878)
    Zstd = 93,

    /// [MP3](https://www.iso.org/obp/ui/#iso:std:iso-iec:11172:-3:ed-1:v1:en)
    Mp3 = 94,

    /// [XZ](https://tukaani.org/xz/xz-file-format.txt)
    Xz = 95,

    /// [JPEG](https://jpeg.org/jpeg/)
    Jpeg = 96,

    /// [WavPack](https://www.wavpack.com/)
    WavPack = 97,

    /// [PPMd](https://en.wikipedia.org/wiki/Prediction_by_partial_matching)
    Ppmd = 98,

    /// AE-x encryption marker (see Appendix E of appnote)
    Aex = 99,

    /// A compression method that isn't recognized by this crate.
    ///
    /// The payload is the raw `u16` value that did not match any other variant.
    #[num_enum(catch_all)]
    Unrecognized(u16),
}