rc_zip/parse/archive.rs
1use chrono::{offset::Utc, DateTime, TimeZone};
2use ownable::{IntoOwned, ToOwned};
3use winnow::{binary::le_u16, PResult, Partial};
4
5use crate::{
6 encoding::Encoding,
7 parse::{Mode, Version},
8};
9
10use super::{zero_datetime, ExtraField, NtfsAttr};
11
12/// An Archive contains general information about a zip file, along with a list
13/// of [entries][Entry].
14///
15/// It is obtained through a state machine like
16/// [ArchiveFsm](crate::fsm::ArchiveFsm), although end-users tend to use
17/// higher-level interfaces like
18/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) or
19/// [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio).
20pub struct Archive {
21 pub(crate) size: u64,
22 pub(crate) encoding: Encoding,
23 pub(crate) entries: Vec<Entry>,
24 pub(crate) comment: String,
25}
26
27impl Archive {
28 /// The size of .zip file that was read, in bytes.
29 #[inline(always)]
30 pub fn size(&self) -> u64 {
31 self.size
32 }
33
34 /// Iterate over all files in this zip, read from the central directory.
35 pub fn entries(&self) -> impl Iterator<Item = &Entry> {
36 self.entries.iter()
37 }
38
39 /// Attempts to look up an entry by name. This is usually a bad idea,
40 /// as names aren't necessarily normalized in zip archives.
41 pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<&Entry> {
42 self.entries.iter().find(|&x| x.name == name.as_ref())
43 }
44
45 /// Returns the detected character encoding for text fields
46 /// (names, comments) inside this zip archive.
47 #[inline(always)]
48 pub fn encoding(&self) -> Encoding {
49 self.encoding
50 }
51
52 /// Returns the comment for this archive, if any. When reading
53 /// a zip file with an empty comment field, this will return None.
54 #[inline(always)]
55 pub fn comment(&self) -> &str {
56 &self.comment
57 }
58}
59
60/// Describes a zip archive entry (a file, a directory, a symlink)
61#[derive(Clone)]
62pub struct Entry {
63 /// Name of the file
64 ///
65 /// This should be a relative path, separated by `/`. However, there are zip
66 /// files in the wild with all sorts of evil variants, so, be conservative
67 /// in what you accept.
68 ///
69 /// See also [Self::sanitized_name], which returns a sanitized version of
70 /// the name, working around zip slip vulnerabilities.
71 pub name: String,
72
73 /// Compression method: Store, Deflate, Bzip2, etc.
74 pub method: Method,
75
76 /// Comment is any arbitrary user-defined string shorter than 64KiB
77 pub comment: String,
78
79 /// This entry's "last modified" timestamp - with caveats
80 ///
81 /// Due to the history of the ZIP file format, this may be inaccurate. It may be offset
82 /// by a few hours, if there is no extended timestamp information. It may have a resolution
83 /// as low as two seconds, if only MSDOS timestamps are present. It may default to the Unix
84 /// epoch, if something went really wrong.
85 ///
86 /// If you're reading this after the year 2038, or after the year 2108, godspeed.
87 pub modified: DateTime<Utc>,
88
89 /// This entry's "created" timestamp, if available.
90 ///
91 /// See [Self::modified] for caveats.
92 pub created: Option<DateTime<Utc>>,
93
94 /// This entry's "last accessed" timestamp, if available.
95 ///
96 /// See [Self::accessed] for caveats.
97 pub accessed: Option<DateTime<Utc>>,
98
99 /// Offset of the local file header in the zip file
100 ///
101 /// ```text
102 /// [optional non-zip data]
103 /// [local file header 1] <------ header_offset points here
104 /// [encryption header 1]
105 /// [file data 1]
106 /// [data descriptor 1]
107 /// ...
108 /// [central directory]
109 /// [optional zip64 end of central directory info]
110 /// [end of central directory record]
111 /// ```
112 pub header_offset: u64,
113
114 /// Version of zip needed to extract this archive.
115 pub reader_version: Version,
116
117 /// General purpose bit flag
118 ///
119 /// In the zip format, the most noteworthy flag (bit 11) is for UTF-8 names.
120 /// Other flags can indicate: encryption (unsupported), various compression
121 /// settings (depending on the [Method] used).
122 ///
123 /// For LZMA, general-purpose bit 1 denotes the EOS marker.
124 pub flags: u16,
125
126 /// Unix user ID
127 ///
128 /// Only present if a Unix extra field or New Unix extra field was found.
129 pub uid: Option<u32>,
130
131 /// Unix group ID
132 ///
133 /// Only present if a Unix extra field or New Unix extra field was found.
134 pub gid: Option<u32>,
135
136 /// CRC-32 hash as found in the central directory.
137 ///
138 /// Note that this may be zero, and the actual CRC32 might be in the local header, or (more
139 /// commonly) in the data descriptor instead.
140 pub crc32: u32,
141
142 /// Size in bytes, after compression
143 pub compressed_size: u64,
144
145 /// Size in bytes, before compression
146 ///
147 /// This will be zero for directories.
148 pub uncompressed_size: u64,
149
150 /// File mode.
151 pub mode: Mode,
152}
153
154impl Entry {
155 /// Returns a sanitized version of the entry's name, if it
156 /// seems safe. In particular, if this method feels like the
157 /// entry name is trying to do a zip slip (cf.
158 /// <https://snyk.io/research/zip-slip-vulnerability>), it'll return
159 /// None.
160 ///
161 /// Other than that, it will strip any leading slashes on non-Windows OSes.
162 pub fn sanitized_name(&self) -> Option<&str> {
163 let name = self.name.as_str();
164
165 // refuse entries with traversed/absolute path to mitigate zip slip
166 if name.contains("..") {
167 return None;
168 }
169
170 #[cfg(windows)]
171 {
172 if name.contains(":\\") || name.starts_with("\\") {
173 return None;
174 }
175 Some(name)
176 }
177
178 #[cfg(not(windows))]
179 {
180 // strip absolute prefix on entries pointing to root path
181 let mut entry_chars = name.chars();
182 let mut name = name;
183 while name.starts_with('/') {
184 entry_chars.next();
185 name = entry_chars.as_str()
186 }
187 Some(name)
188 }
189 }
190
191 /// Apply the extra field to the entry, updating its metadata.
192 pub(crate) fn set_extra_field(&mut self, ef: &ExtraField) {
193 match &ef {
194 ExtraField::Zip64(z64) => {
195 self.uncompressed_size = z64.uncompressed_size;
196 self.compressed_size = z64.compressed_size;
197 self.header_offset = z64.header_offset;
198 }
199 ExtraField::Timestamp(ts) => {
200 self.modified = Utc
201 .timestamp_opt(ts.mtime as i64, 0)
202 .single()
203 .unwrap_or_else(zero_datetime);
204 }
205 ExtraField::Ntfs(nf) => {
206 for attr in &nf.attrs {
207 // note: other attributes are unsupported
208 if let NtfsAttr::Attr1(attr) = attr {
209 self.modified = attr.mtime.to_datetime().unwrap_or_else(zero_datetime);
210 self.created = attr.ctime.to_datetime();
211 self.accessed = attr.atime.to_datetime();
212 }
213 }
214 }
215 ExtraField::Unix(uf) => {
216 self.modified = Utc
217 .timestamp_opt(uf.mtime as i64, 0)
218 .single()
219 .unwrap_or_else(zero_datetime);
220
221 if self.uid.is_none() {
222 self.uid = Some(uf.uid as u32);
223 }
224
225 if self.gid.is_none() {
226 self.gid = Some(uf.gid as u32);
227 }
228 }
229 ExtraField::NewUnix(uf) => {
230 self.uid = Some(uf.uid as u32);
231 self.gid = Some(uf.uid as u32);
232 }
233 _ => {}
234 };
235 }
236}
237
/// File type of an entry: a directory, a file, or a symbolic link.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EntryKind {
    /// A directory entry
    Directory,

    /// A file entry
    File,

    /// A symbolic link entry
    Symlink,
}
250
251impl EntryKind {
252 /// Returns if this is a [`EntryKind::Directory`]
253 ///
254 /// ```
255 /// # use rc_zip::EntryKind;
256 /// assert!(EntryKind::Directory.is_dir());
257 /// ```
258 pub fn is_dir(self) -> bool {
259 self == Self::Directory
260 }
261
262 /// Returns if this is a [`EntryKind::File`]
263 ///
264 /// ```
265 /// # struct Archive;
266 /// # impl Archive {
267 /// # fn entries(self) -> std::vec::IntoIter<rc_zip::Entry> {
268 /// # Vec::new().into_iter()
269 /// # }
270 /// # }
271 /// # let archive = Archive;
272 /// for entry in archive.entries().filter(|e| e.kind().is_file()) {
273 /// // ...
274 /// }
275 /// ```
276 pub fn is_file(self) -> bool {
277 self == Self::File
278 }
279
280 /// Returns if this is a [`EntryKind::Symlink`]
281 ///
282 /// ```
283 /// # struct Archive;
284 /// # impl Archive {
285 /// # fn entries(self) -> Vec<rc_zip::Entry> {
286 /// # Vec::new()
287 /// # }
288 /// # }
289 /// # let archive = Archive;
290 /// for entry in archive.entries() {
291 /// if entry.kind().is_symlink() {
292 /// continue;
293 /// }
294 ///
295 /// // ...
296 /// }
297 /// ```
298 pub fn is_symlink(self) -> bool {
299 self == Self::Symlink
300 }
301}
302
303impl Entry {
304 /// Determine the kind of this entry based on its mode.
305 pub fn kind(&self) -> EntryKind {
306 if self.mode.has(Mode::SYMLINK) {
307 EntryKind::Symlink
308 } else if self.mode.has(Mode::DIR) {
309 EntryKind::Directory
310 } else {
311 EntryKind::File
312 }
313 }
314}
315
316/// Compression method used for a file entry.
317///
318/// In archives that follow [ISO/IEC 21320-1:2015](https://www.iso.org/standard/60101.html), only
319/// [Store][Method::Store] and [Deflate][Method::Deflate] should be used.
320///
321/// However, in the wild, it is not too uncommon to encounter [Bzip2][Method::Bzip2],
322/// [Lzma][Method::Lzma] or others.
323#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IntoOwned, ToOwned)]
324#[repr(u16)]
325pub enum Method {
326 /// No compression is applied
327 Store = Self::STORE,
328
329 /// [DEFLATE (RFC 1951)](https://www.ietf.org/rfc/rfc1951.txt)
330 Deflate = Self::DEFLATE,
331
332 /// [DEFLATE64](https://deflate64.com/)
333 Deflate64 = Self::DEFLATE64,
334
335 /// [BZIP-2](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf)
336 Bzip2 = Self::BZIP2,
337
338 /// [LZMA](https://github.com/jljusten/LZMA-SDK/blob/master/DOC/lzma-specification.txt)
339 Lzma = Self::LZMA,
340
341 /// [zstd](https://datatracker.ietf.org/doc/html/rfc8878)
342 Zstd = Self::ZSTD,
343
344 /// [MP3](https://www.iso.org/obp/ui/#iso:std:iso-iec:11172:-3:ed-1:v1:en)
345 Mp3 = Self::MP3,
346
347 /// [XZ](https://tukaani.org/xz/xz-file-format.txt)
348 Xz = Self::XZ,
349
350 /// [JPEG](https://jpeg.org/jpeg/)
351 Jpeg = Self::JPEG,
352
353 /// [WavPack](https://www.wavpack.com/)
354 WavPack = Self::WAV_PACK,
355
356 /// [PPMd](https://en.wikipedia.org/wiki/Prediction_by_partial_matching)
357 Ppmd = Self::PPMD,
358
359 /// AE-x encryption marker (see Appendix E of appnote)
360 Aex = Self::AEX,
361
362 /// A compression method that isn't recognized by this crate.
363 Unrecognized(u16),
364}
365
366impl Method {
367 const STORE: u16 = 0;
368 const DEFLATE: u16 = 8;
369 const DEFLATE64: u16 = 9;
370 const BZIP2: u16 = 12;
371 const LZMA: u16 = 14;
372 const ZSTD: u16 = 93;
373 const MP3: u16 = 94;
374 const XZ: u16 = 95;
375 const JPEG: u16 = 96;
376 const WAV_PACK: u16 = 97;
377 const PPMD: u16 = 98;
378 const AEX: u16 = 99;
379
380 /// Parse a method from a byte slice
381 pub fn parser(i: &mut Partial<&[u8]>) -> PResult<Self> {
382 le_u16(i).map(From::from)
383 }
384}
385
386impl From<u16> for Method {
387 fn from(u: u16) -> Self {
388 match u {
389 Self::STORE => Self::Store,
390 Self::DEFLATE => Self::Deflate,
391 Self::DEFLATE64 => Self::Deflate64,
392 Self::BZIP2 => Self::Bzip2,
393 Self::LZMA => Self::Lzma,
394 Self::ZSTD => Self::Zstd,
395 Self::MP3 => Self::Mp3,
396 Self::XZ => Self::Xz,
397 Self::JPEG => Self::Jpeg,
398 Self::WAV_PACK => Self::WavPack,
399 Self::PPMD => Self::Ppmd,
400 Self::AEX => Self::Aex,
401 u => Self::Unrecognized(u),
402 }
403 }
404}
405
406impl From<Method> for u16 {
407 fn from(method: Method) -> Self {
408 match method {
409 Method::Store => Method::STORE,
410 Method::Deflate => Method::DEFLATE,
411 Method::Deflate64 => Method::DEFLATE64,
412 Method::Bzip2 => Method::BZIP2,
413 Method::Lzma => Method::LZMA,
414 Method::Zstd => Method::ZSTD,
415 Method::Mp3 => Method::MP3,
416 Method::Xz => Method::XZ,
417 Method::Jpeg => Method::JPEG,
418 Method::WavPack => Method::WAV_PACK,
419 Method::Ppmd => Method::PPMD,
420 Method::Aex => Method::AEX,
421 Method::Unrecognized(u) => u,
422 }
423 }
424}