Skip to main content

stream_unpack/zip/
mod.rs

1use thiserror::Error;
2
3use crate::{decrypt::DecryptorCreationError, pipeline::{Pipeline, PipelineError}};
4
5use self::structures::{local_file_header::{LocalFileHeader, LFH_SIGNATURE, LFH_CONSTANT_SIZE}, DecompressorCreationError, central_directory::{CentralDirectoryFileHeader, SortedCentralDirectory}};
6
/// Provides utilities for working with ZIP structures
8pub mod structures;
9
10/// Provides utilities for automatically locating and reading a central directory
11pub mod read_cd;
12
13#[derive(Debug, Error)]
14pub enum DecoderError {
15    #[error("file pipeline failed: {0}")]
16    Pipeline(#[from] PipelineError),
17
18    #[error("could not create decompressor: {0}")]
19    DecompressorInit(#[from] DecompressorCreationError),
20
21    #[error("could not create decryptor: {0}")]
22    DecryptorInit(#[from] DecryptorCreationError),
23
24    #[error("no password provided for encrypted file")]
25    NoPassword,
26
27    #[error("data exceeded archive size")]
28    ExtraData,
29
30    #[error("next header is at {0} but current position is {1}, one of the disk sizes is probably invalid")]
31    Overshoot(ZipPosition, ZipPosition),
32
33    #[error("could not find a file with position {0} in the central directory")]
34    InvalidOffset(ZipPosition),
35
36    #[error("file header has an invalid signature")]
37    InvalidSignature,
38
39    #[error("error within callback: {0}")]
40    FromDecodeCallback(#[from] anyhow::Error)
41}
42
43#[derive(Debug)]
44enum ZipDecoderState {
45    FileHeader,
46    FileData(u64, LocalFileHeader, Pipeline)
47}
48
/// Represents a position in a (possibly multipart) ZIP archive
///
/// Positions are ordered by disk number first and by offset second.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct ZipPosition {
    /// Number of the disk (archive part) this position is on
    pub disk: usize,
    /// Offset within that disk
    pub offset: usize,
}

impl std::fmt::Display for ZipPosition {
    /// Writes the position as `disk:offset`
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { disk, offset } = self;
        write!(f, "{}:{}", disk, offset)
    }
}

impl ZipPosition {
    /// Builds a position from a disk number and an offset on that disk
    pub fn new(disk: usize, offset: usize) -> Self {
        Self { disk, offset }
    }

    /// Builds a position on disk 0 at the given offset
    pub fn from_offset(offset: usize) -> Self {
        Self { disk: 0, offset }
    }
}
76
77/// A chunk of decoded ZIP data
78#[derive(Debug)]
79pub enum ZipDecodedData<'a> {
80    /// The ZIP file headers for a file
81    FileHeader(&'a CentralDirectoryFileHeader, &'a LocalFileHeader),
82
83    /// Decoded (uncompressed or decompressed) file bytes
84    FileData(&'a [u8])
85}
86
87/// A stream unpacker for ZIP archives
88pub struct ZipUnpacker<'a> {
89    decoder_state: ZipDecoderState,
90    current_index: usize,
91    current_position: ZipPosition,
92
93    disk_sizes: Vec<usize>,
94    central_directory: SortedCentralDirectory,
95
96    password: Option<Vec<u8>>,
97
98    #[allow(clippy::type_complexity)]
99    on_decode: Option<Box<dyn Fn(ZipDecodedData) -> anyhow::Result<()> + 'a>>
100}
101
102impl std::fmt::Debug for ZipUnpacker<'_> {
103    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
104        f.debug_struct("ZipUnpacker")
105            .field("decoder_state", &self.decoder_state)
106            .field("current_index", &self.current_index)
107            .field("current_position", &self.current_position)
108            .field("disk_sizes", &self.disk_sizes)
109            .finish()
110    }
111}
112
113impl<'a> ZipUnpacker<'a> {
114    /// Creates a new ZipUnpacker
115    ///
116    /// The easiest way to obtain a central directory object is to use [read_cd::from_provider].
117    /// "disk_sizes" must only contain one element if the archive is a cut one, and not a
118    /// real split one.
119    pub fn new(central_directory: SortedCentralDirectory, disk_sizes: Vec<usize>) -> Self {
120        Self {
121            decoder_state: ZipDecoderState::FileHeader,
122            current_index: 0,
123            current_position: ZipPosition::default(),
124
125            disk_sizes,
126            central_directory,
127
128            password: None,
129
130            on_decode: None
131        }
132    }
133
134    /// Creates a new ZipUnpacker capable of decrypting files with a specified password.
135    /// See [ZipUnpacker::new] for more information.
136    pub fn new_with_password(central_directory: SortedCentralDirectory, disk_sizes: Vec<usize>, password: Vec<u8>) -> Self {
137        Self {
138            password: Some(password),
139            ..Self::new(central_directory, disk_sizes)
140        }
141    }
142
143    /// Creates a new ZipUnpacker, starting from the specified position. If the archive
144    /// is not actually split, you must set disk number to 0 and use the absolute offset,
145    /// even if there are multiple files
146    ///
147    /// The easiest way to obtain a central directory object is to use [read_cd::from_provider].
148    /// "disk_sizes" must only contain one element if the archive is a cut one, and not a
149    /// real split one.
150    pub fn resume(central_directory: SortedCentralDirectory, disk_sizes: Vec<usize>, position: ZipPosition) -> Result<Self, DecoderError> {
151        let index = central_directory.headers_ref()
152            .binary_search_by(|h| h.header_position().cmp(&position))
153            .map_err(|_| DecoderError::InvalidOffset(position))?;
154
155        Ok(Self {
156            decoder_state: ZipDecoderState::FileHeader,
157            current_index: index,
158            current_position: position,
159
160            disk_sizes,
161            central_directory,
162
163            password: None,
164
165            on_decode: None
166        })
167    }
168
169    /// Creates a new ZipUnpacker capable of decrypting files with the specified password, starting at
170    /// the specified position. See [ZipUnpacker::resume] for more information.
171    pub fn resume_with_password(central_directory: SortedCentralDirectory, disk_sizes: Vec<usize>, password: Vec<u8>, position: ZipPosition) -> Result<Self, DecoderError> {
172        Ok(Self {
173            password: Some(password),
174            ..(Self::resume(central_directory, disk_sizes, position)?)
175        })
176    }
177
178    /// Sets the decode callback. The passed closure will be invoked
179    /// when new data is decoded from bytes passed to [ZipUnpacker::update]
180    pub fn set_callback(&mut self, on_decode: impl Fn(ZipDecodedData) -> anyhow::Result<()> + 'a) {
181        self.on_decode = Some(Box::new(on_decode));
182    }
183
184    /// Update this ZipUnpacker with new bytes. The callback may or
185    /// may not be fired, depending on the content. The callback may
186    /// be fired multiple times.
187    ///
188    /// The first return value is how much the caller should advance the input buffer
189    /// (0 means that there wasn't enough data in the buffer and the caller should 
190    /// provide more), and the second value determines whether all files were processed 
191    /// (which means that the caller should stop providing data)
192    pub fn update(&mut self, data: impl AsRef<[u8]>) -> Result<(usize, bool), DecoderError> {
193        let data = data.as_ref();
194
195        let mut buf_offset = 0;
196        loop {
197            let (advanced, reached_end) = self.update_internal(&data[buf_offset..])?;
198            buf_offset += advanced;
199
200            self.current_position.offset += advanced;
201            if self.current_position.offset >= self.disk_sizes[self.current_position.disk] {
202                // Find which disk this offset will be at
203                let mut new_offset = self.current_position.offset;
204                let mut new_disk_number = None;
205                for d in (self.current_position.disk)..(self.disk_sizes.len() - 1) {
206                    new_offset -= self.disk_sizes[d];
207                    if new_offset < self.disk_sizes[d + 1] {
208                        new_disk_number = Some(d + 1);
209                        break;
210                    }
211                }
212
213                let Some(new_disk_number) = new_disk_number else {
214                    return Err(DecoderError::ExtraData);
215                };
216
217                self.current_position.offset = new_offset;
218                self.current_position.disk = new_disk_number;
219            }
220
221            if advanced == 0 || reached_end {
222                return Ok((buf_offset, reached_end));
223            }
224        }
225    }
226
227    fn update_internal(&mut self, data: impl AsRef<[u8]>) -> Result<(usize, bool), DecoderError> {
228        let headers = self.central_directory.headers_ref();
229        if self.current_index >= headers.len() {
230            return Ok((0, true));
231        }
232        let cdfh = &headers[self.current_index];
233
234        let data = data.as_ref();
235
236        match &mut self.decoder_state {
237            ZipDecoderState::FileHeader => {
238                if self.current_position > cdfh.header_position() {
239                    return Err(DecoderError::Overshoot(cdfh.header_position(), self.current_position));
240                }
241
242                if self.current_position.disk < cdfh.disk_number as usize {
243                    // Next disk
244                    return Ok((std::cmp::min(self.disk_sizes[self.current_position.disk] - self.current_position.offset, data.len()), false));
245                }
246
247                if self.current_position.offset < cdfh.local_header_offset as usize {
248                    return Ok((std::cmp::min(cdfh.local_header_offset as usize - self.current_position.offset, data.len()), false));
249                }
250
251                if data.len() < 4 + LFH_CONSTANT_SIZE {
252                    return Ok((0, false));
253                }
254
255                let signature = u32::from_le_bytes(data[..4].try_into().unwrap());
256                if signature != LFH_SIGNATURE {
257                    return Err(DecoderError::InvalidSignature);
258                }
259
260                let Some(lfh) = LocalFileHeader::from_bytes(&data[4..]) else {
261                    return Ok((0, false));
262                };
263                let header_size = lfh.header_size;
264
265                if let Some(on_decode) = &self.on_decode {
266                    (on_decode)(ZipDecodedData::FileHeader(cdfh, &lfh))?;
267                }
268
269                let decryptor = if lfh.is_encrypted() {
270                    if let Some(password) = &self.password {
271                        Some(lfh.create_decryptor(password)?)
272                    } else {
273                        return Err(DecoderError::NoPassword);
274                    }
275                } else { None };
276
277                if lfh.uncompressed_size != 0 {
278                    let decompressor = lfh.compression_method
279                        .as_ref()
280                        .map(|m| m.create_decompressor())
281                        .transpose()?;
282
283                    let pipeline = Pipeline::new(decryptor, decompressor);
284                    self.decoder_state = ZipDecoderState::FileData(0, lfh, pipeline);
285                } else {
286                    self.decoder_state = ZipDecoderState::FileHeader;
287                    self.current_index += 1;
288                }
289
290                Ok((4 + header_size, false))
291            },
292
293            ZipDecoderState::FileData(pos, lfh, pipeline) => {
294                let bytes_left = lfh.compressed_size - *pos;
295                let bytes_to_read = std::cmp::min(bytes_left as usize, data.len());
296                let file_bytes = &data[..bytes_to_read];
297
298                let (count, data) = pipeline.update(file_bytes)?;
299                *pos += count as u64;
300
301                if let Some(on_decode) = &self.on_decode {
302                    (on_decode)(ZipDecodedData::FileData(data))?;
303                }
304
305                if count as u64 == bytes_left {
306                    self.decoder_state = ZipDecoderState::FileHeader;
307                    self.current_index += 1;
308                }
309
310                Ok((count, false))
311            }
312        }
313    }
314}