async_zip/base/read/
mod.rs

1// Copyright (c) 2022-2023 Harry [Majored] [hello@majored.pw]
2// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
3
4//! A module which supports reading ZIP files.
5
6pub mod mem;
7pub mod seek;
8pub mod stream;
9
10pub mod cd;
11mod counting;
12pub(crate) mod io;
13
14use crate::ZipString;
15// Re-exported as part of the public API.
16pub use crate::base::read::io::entry::WithEntry;
17pub use crate::base::read::io::entry::WithoutEntry;
18pub use crate::base::read::io::entry::ZipEntryReader;
19
20use crate::date::ZipDateTime;
21use crate::entry::{StoredZipEntry, ZipEntry};
22use crate::error::{Result, ZipError};
23use crate::file::ZipFile;
24use crate::spec::attribute::AttributeCompatibility;
25use crate::spec::consts::LFH_LENGTH;
26use crate::spec::consts::{CDH_SIGNATURE, LFH_SIGNATURE, NON_ZIP64_MAX_SIZE, SIGNATURE_LENGTH, ZIP64_EOCDL_LENGTH};
27use crate::spec::header::InfoZipUnicodeCommentExtraField;
28use crate::spec::header::InfoZipUnicodePathExtraField;
29use crate::spec::header::{
30    CentralDirectoryRecord, EndOfCentralDirectoryHeader, ExtraField, LocalFileHeader,
31    Zip64EndOfCentralDirectoryLocator, Zip64EndOfCentralDirectoryRecord, Zip64ExtendedInformationExtraField,
32};
33use crate::spec::Compression;
34use crate::string::StringEncoding;
35
36use crate::base::read::io::CombinedCentralDirectoryRecord;
37use crate::spec::parse::parse_extra_fields;
38
39use futures_lite::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt, BufReader, SeekFrom};
40
41/// The max buffer size used when parsing the central directory, equal to 20MiB.
42const MAX_CD_BUFFER_SIZE: usize = 20 * 1024 * 1024;
43
44pub(crate) async fn file<R>(mut reader: R) -> Result<ZipFile>
45where
46    R: AsyncRead + AsyncSeek + Unpin,
47{
48    // First find and parse the EOCDR.
49    let eocdr_offset = crate::base::read::io::locator::eocdr(&mut reader).await?;
50
51    reader.seek(SeekFrom::Start(eocdr_offset)).await?;
52    let eocdr = EndOfCentralDirectoryHeader::from_reader(&mut reader).await?;
53
54    let comment = io::read_string(&mut reader, eocdr.file_comm_length.into(), crate::StringEncoding::Utf8).await?;
55
56    // Check the 20 bytes before the EOCDR for the Zip64 EOCDL, plus an extra 4 bytes because the offset
57    // does not include the signature. If the ECODL exists we are dealing with a Zip64 file.
58    let (eocdr, zip64) = match eocdr_offset.checked_sub(ZIP64_EOCDL_LENGTH + SIGNATURE_LENGTH as u64) {
59        None => (CombinedCentralDirectoryRecord::from(&eocdr), false),
60        Some(offset) => {
61            reader.seek(SeekFrom::Start(offset)).await?;
62            let zip64_locator = Zip64EndOfCentralDirectoryLocator::try_from_reader(&mut reader).await?;
63
64            match zip64_locator {
65                Some(locator) => {
66                    reader.seek(SeekFrom::Start(locator.relative_offset + SIGNATURE_LENGTH as u64)).await?;
67                    let zip64_eocdr = Zip64EndOfCentralDirectoryRecord::from_reader(&mut reader).await?;
68                    (CombinedCentralDirectoryRecord::combine(eocdr, zip64_eocdr), true)
69                }
70                None => (CombinedCentralDirectoryRecord::from(&eocdr), false),
71            }
72        }
73    };
74
75    // Outdated feature so unlikely to ever make it into this crate.
76    if eocdr.disk_number != eocdr.disk_number_start_of_cd
77        || eocdr.num_entries_in_directory != eocdr.num_entries_in_directory_on_disk
78    {
79        return Err(ZipError::FeatureNotSupported("Spanned/split files"));
80    }
81
82    // Find and parse the central directory.
83    reader.seek(SeekFrom::Start(eocdr.offset_of_start_of_directory)).await?;
84
85    // To avoid lots of small reads to `reader` when parsing the central directory, we use a BufReader that can read the whole central directory at once.
86    // Because `eocdr.offset_of_start_of_directory` is a u64, we use MAX_CD_BUFFER_SIZE to prevent very large buffer sizes.
87    let buf =
88        BufReader::with_capacity(std::cmp::min(eocdr.offset_of_start_of_directory as _, MAX_CD_BUFFER_SIZE), reader);
89    let entries = crate::base::read::cd(buf, eocdr.num_entries_in_directory, zip64).await?;
90
91    Ok(ZipFile { entries, comment, zip64 })
92}
93
94pub(crate) async fn cd<R>(mut reader: R, num_of_entries: u64, zip64: bool) -> Result<Vec<StoredZipEntry>>
95where
96    R: AsyncRead + Unpin,
97{
98    let num_of_entries = num_of_entries.try_into().map_err(|_| ZipError::TargetZip64NotSupported)?;
99    let mut entries = Vec::with_capacity(num_of_entries);
100
101    for _ in 0..num_of_entries {
102        let entry = cd_record(&mut reader, zip64).await?;
103        entries.push(entry);
104    }
105
106    Ok(entries)
107}
108
109pub(crate) fn get_zip64_extra_field(extra_fields: &[ExtraField]) -> Option<&Zip64ExtendedInformationExtraField> {
110    for field in extra_fields {
111        if let ExtraField::Zip64ExtendedInformation(zip64field) = field {
112            return Some(zip64field);
113        }
114    }
115    None
116}
117
118fn get_combined_sizes(
119    uncompressed_size: u32,
120    compressed_size: u32,
121    extra_field: &Option<&Zip64ExtendedInformationExtraField>,
122) -> Result<(u64, u64)> {
123    let mut uncompressed_size = uncompressed_size as u64;
124    let mut compressed_size = compressed_size as u64;
125
126    if let Some(extra_field) = extra_field {
127        if let Some(s) = extra_field.uncompressed_size {
128            if uncompressed_size == NON_ZIP64_MAX_SIZE as u64 {
129                uncompressed_size = s;
130            }
131        }
132        if let Some(s) = extra_field.compressed_size {
133            if compressed_size == NON_ZIP64_MAX_SIZE as u64 {
134                compressed_size = s;
135            }
136        }
137    }
138
139    Ok((uncompressed_size, compressed_size))
140}
141
142pub(crate) async fn cd_record<R>(mut reader: R, _zip64: bool) -> Result<StoredZipEntry>
143where
144    R: AsyncRead + Unpin,
145{
146    crate::utils::assert_signature(&mut reader, CDH_SIGNATURE).await?;
147
148    let header = CentralDirectoryRecord::from_reader(&mut reader).await?;
149    let header_size = (SIGNATURE_LENGTH + LFH_LENGTH) as u64;
150    let trailing_size = header.file_name_length as u64 + header.extra_field_length as u64;
151    let filename_basic = io::read_bytes(&mut reader, header.file_name_length.into()).await?;
152    let compression = Compression::try_from(header.compression)?;
153    let extra_field = io::read_bytes(&mut reader, header.extra_field_length.into()).await?;
154    let extra_fields = parse_extra_fields(
155        extra_field,
156        header.uncompressed_size,
157        header.compressed_size,
158        Some(header.lh_offset),
159        Some(header.disk_start),
160    )?;
161    let comment_basic = io::read_bytes(reader, header.file_comment_length.into()).await?;
162
163    let zip64_extra_field = get_zip64_extra_field(&extra_fields);
164    let (uncompressed_size, compressed_size) =
165        get_combined_sizes(header.uncompressed_size, header.compressed_size, &zip64_extra_field)?;
166
167    let mut file_offset = header.lh_offset as u64;
168    if let Some(zip64_extra_field) = zip64_extra_field {
169        if file_offset == NON_ZIP64_MAX_SIZE as u64 {
170            if let Some(offset) = zip64_extra_field.relative_header_offset {
171                file_offset = offset;
172            }
173        }
174    }
175
176    let filename = detect_filename(filename_basic, header.flags.filename_unicode, extra_fields.as_ref());
177    let comment = detect_comment(comment_basic, header.flags.filename_unicode, extra_fields.as_ref());
178
179    let entry = ZipEntry {
180        filename,
181        compression,
182        #[cfg(any(
183            feature = "deflate",
184            feature = "bzip2",
185            feature = "zstd",
186            feature = "lzma",
187            feature = "xz",
188            feature = "deflate64"
189        ))]
190        compression_level: async_compression::Level::Default,
191        attribute_compatibility: AttributeCompatibility::Unix,
192        // FIXME: Default to Unix for the moment
193        crc32: header.crc,
194        uncompressed_size,
195        compressed_size,
196        last_modification_date: ZipDateTime { date: header.mod_date, time: header.mod_time },
197        internal_file_attribute: header.inter_attr,
198        external_file_attribute: header.exter_attr,
199        extra_fields,
200        comment,
201        data_descriptor: header.flags.data_descriptor,
202        file_offset,
203    };
204
205    Ok(StoredZipEntry { entry, file_offset, header_size: header_size + trailing_size })
206}
207
208pub(crate) async fn lfh<R>(mut reader: R, file_offset: u64) -> Result<Option<ZipEntry>>
209where
210    R: AsyncRead + Unpin,
211{
212    let signature = {
213        let mut buffer = [0; 4];
214        reader.read_exact(&mut buffer).await?;
215        u32::from_le_bytes(buffer)
216    };
217    match signature {
218        actual if actual == LFH_SIGNATURE => (),
219        actual if actual == CDH_SIGNATURE => return Ok(None),
220        actual => return Err(ZipError::UnexpectedHeaderError(actual, LFH_SIGNATURE)),
221    };
222
223    let header = LocalFileHeader::from_reader(&mut reader).await?;
224    let filename_basic = io::read_bytes(&mut reader, header.file_name_length.into()).await?;
225    let compression = Compression::try_from(header.compression)?;
226    let extra_field = io::read_bytes(&mut reader, header.extra_field_length.into()).await?;
227    let extra_fields = parse_extra_fields(extra_field, header.uncompressed_size, header.compressed_size, None, None)?;
228
229    let zip64_extra_field = get_zip64_extra_field(&extra_fields);
230    let (uncompressed_size, compressed_size) =
231        get_combined_sizes(header.uncompressed_size, header.compressed_size, &zip64_extra_field)?;
232
233    if header.flags.data_descriptor && compression == Compression::Stored {
234        return Err(ZipError::FeatureNotSupported(
235            "stream reading entries with data descriptors & Stored compression mode",
236        ));
237    }
238    if header.flags.encrypted {
239        return Err(ZipError::FeatureNotSupported("encryption"));
240    }
241
242    let filename = detect_filename(filename_basic, header.flags.filename_unicode, extra_fields.as_ref());
243
244    let entry = ZipEntry {
245        filename,
246        compression,
247        #[cfg(any(
248            feature = "deflate",
249            feature = "bzip2",
250            feature = "zstd",
251            feature = "lzma",
252            feature = "xz",
253            feature = "deflate64"
254        ))]
255        compression_level: async_compression::Level::Default,
256        attribute_compatibility: AttributeCompatibility::Unix,
257        // FIXME: Default to Unix for the moment
258        crc32: header.crc,
259        uncompressed_size,
260        compressed_size,
261        last_modification_date: ZipDateTime { date: header.mod_date, time: header.mod_time },
262        internal_file_attribute: 0,
263        external_file_attribute: 0,
264        extra_fields,
265        comment: String::new().into(),
266        data_descriptor: header.flags.data_descriptor,
267        file_offset,
268    };
269
270    Ok(Some(entry))
271}
272
273fn detect_comment(basic: Vec<u8>, basic_is_utf8: bool, extra_fields: &[ExtraField]) -> ZipString {
274    if basic_is_utf8 {
275        ZipString::new(basic, StringEncoding::Utf8)
276    } else {
277        let unicode_extra = extra_fields.iter().find_map(|field| match field {
278            ExtraField::InfoZipUnicodeComment(InfoZipUnicodeCommentExtraField::V1 { crc32, unicode }) => {
279                if *crc32 == crc32fast::hash(&basic) {
280                    Some(std::string::String::from_utf8(unicode.clone()))
281                } else {
282                    None
283                }
284            }
285            _ => None,
286        });
287        if let Some(Ok(s)) = unicode_extra {
288            ZipString::new_with_alternative(s, basic)
289        } else {
290            // Do not treat as UTF-8 if UTF-8 flags are not set,
291            // some string in MBCS may be valid UTF-8 in form, but they are not in truth.
292            if basic.is_ascii() {
293                // SAFETY:
294                // a valid ASCII string is always a valid UTF-8 string
295                unsafe { std::string::String::from_utf8_unchecked(basic).into() }
296            } else {
297                ZipString::new(basic, StringEncoding::Raw)
298            }
299        }
300    }
301}
302
303fn detect_filename(basic: Vec<u8>, basic_is_utf8: bool, extra_fields: &[ExtraField]) -> ZipString {
304    let unicode_extra = extra_fields.iter().find_map(|field| match field {
305        ExtraField::InfoZipUnicodePath(InfoZipUnicodePathExtraField::V1 { crc32, unicode }) => {
306            if !unicode.is_empty() && *crc32 == crc32fast::hash(&basic) {
307                Some(std::string::String::from_utf8(unicode.clone()))
308            } else {
309                None
310            }
311        }
312        _ => None,
313    });
314    if let Some(Ok(s)) = unicode_extra {
315        ZipString::new_with_alternative(s, basic)
316    } else if basic_is_utf8 {
317        ZipString::new(basic, StringEncoding::Utf8)
318    } else {
319        // Do not treat as UTF-8 if UTF-8 flags are not set,
320        // some string in MBCS may be valid UTF-8 in form, but they are not in truth.
321        if basic.is_ascii() {
322            // SAFETY:
323            // a valid ASCII string is always a valid UTF-8 string
324            unsafe { std::string::String::from_utf8_unchecked(basic).into() }
325        } else {
326            ZipString::new(basic, StringEncoding::Raw)
327        }
328    }
329}