stream_unpack/zip/
mod.rs

1use thiserror::Error;
2
3use crate::decompress::{Decompressor, DecompressionError};
4
5use self::structures::{local_file_header::{LocalFileHeader, LFH_SIGNATURE, LFH_CONSTANT_SIZE}, DecompressorCreationError, central_directory::{CentralDirectoryFileHeader, SortedCentralDirectory}};
6
/// Provides utilities for working with ZIP structures
8pub mod structures;
9
10/// Provides utilities for automatically locating and reading a central directory
11pub mod read_cd;
12
13#[derive(Debug, Error)]
14pub enum DecoderError {
15    #[error("failed to decompress: {0}")]
16    Decompression(#[from] DecompressionError),
17
18    #[error("could not create decompressor: {0}")]
19    DecompressorInit(#[from] DecompressorCreationError),
20
21    #[error("data exceeded archive size")]
22    ExtraData,
23
24    #[error("next header is at {0} but current position is {1}, one of the disk sizes is probably invalid")]
25    Overshoot(ZipPosition, ZipPosition),
26
27    #[error("could not find a file with position {0} in the central directory")]
28    InvalidOffset(ZipPosition),
29
30    #[error("file header has an invalid signature")]
31    InvalidSignature,
32
33    #[error("error within callback: {0}")]
34    FromDecodeCallback(#[from] anyhow::Error)
35}
36
/// Internal state machine of the unpacker
#[derive(Debug)]
enum ZipDecoderState {
    /// The next thing to decode is a local file header
    FileHeader,

    /// File data is being decoded. Holds, in order: the number of compressed
    /// bytes consumed so far, the local header of the file, and the decompressor
    /// to run the bytes through (`None` means the bytes are emitted unchanged)
    FileData(u64, LocalFileHeader, Option<Box<dyn Decompressor>>),
}
42
/// Represents a position in a (possibly multipart) ZIP archive
///
/// Positions are ordered by disk number first and by offset second.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Default)]
pub struct ZipPosition {
    /// Number of the disk (archive part)
    pub disk: usize,
    /// Byte offset within that disk
    pub offset: usize,
}

impl std::fmt::Display for ZipPosition {
    /// Formats the position as `disk:offset`
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { disk, offset } = *self;
        write!(f, "{disk}:{offset}")
    }
}

impl ZipPosition {
    /// Creates a new ZipPosition from the specified disk number and offset
    pub fn new(disk: usize, offset: usize) -> Self {
        Self { disk, offset }
    }

    /// Creates a new ZipPosition from the offset with disk number 0
    pub fn from_offset(offset: usize) -> Self {
        Self { disk: 0, offset }
    }
}
70
71/// A chunk of decoded ZIP data
72#[derive(Debug)]
73pub enum ZipDecodedData<'a> {
74    /// The ZIP file headers for a file
75    FileHeader(&'a CentralDirectoryFileHeader, &'a LocalFileHeader),
76
77    /// Decoded (uncompressed or decompressed) file bytes 
78    FileData(&'a [u8])
79}
80
81/// A stream unpacker for ZIP archives
82pub struct ZipUnpacker<'a> {
83    decoder_state: ZipDecoderState,
84    current_index: usize,
85    current_position: ZipPosition,
86
87    disk_sizes: Vec<usize>,
88    central_directory: SortedCentralDirectory,
89
90    #[allow(clippy::type_complexity)]
91    on_decode: Option<Box<dyn Fn(ZipDecodedData) -> anyhow::Result<()> + 'a>>
92}
93
94impl std::fmt::Debug for ZipUnpacker<'_> {
95    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
96        f.debug_struct("ZipUnpacker")
97            .field("decoder_state", &self.decoder_state)
98            .field("current_index", &self.current_index)
99            .field("current_position", &self.current_position)
100            .field("disk_sizes", &self.disk_sizes)
101            .finish()
102    }
103}
104
105impl<'a> ZipUnpacker<'a> {
106    /// Creates a new ZipUnpacker
107    /// 
108    /// The easiest way to obtain a central directory object is to use [read_cd::from_provider].
109    /// "disk_sizes" must only contain one element if the archive is a cut one, and not a
110    /// real split one.
111    pub fn new(central_directory: SortedCentralDirectory, disk_sizes: Vec<usize>) -> Self {
112        Self {
113            decoder_state: ZipDecoderState::FileHeader,
114            current_index: 0,
115            current_position: ZipPosition::default(),
116
117            disk_sizes,
118            central_directory,
119
120            on_decode: None
121        }
122    }
123
124    /// Creates a new ZipUnpacker, starting from the specified position. If the archive
125    /// is not actually split, you must set disk number to 0 and use the absolute offset,
126    /// even if there are multiple files
127    /// 
128    /// The easiest way to obtain a central directory object is to use [read_cd::from_provider].
129    /// "disk_sizes" must only contain one element if the archive is a cut one, and not a
130    /// real split one.
131    pub fn resume(central_directory: SortedCentralDirectory, disk_sizes: Vec<usize>, position: ZipPosition) -> Result<Self, DecoderError> {
132        let index = central_directory.headers_ref()
133            .binary_search_by(|h| h.header_position().cmp(&position))
134            .map_err(|_| DecoderError::InvalidOffset(position))?;
135
136        Ok(Self {
137            decoder_state: ZipDecoderState::FileHeader,
138            current_index: index,
139            current_position: position,
140
141            disk_sizes,
142            central_directory,
143
144            on_decode: None
145        })
146    }
147
148    /// Sets the decode callback. The passed closure will be invoked
149    /// when new data is decoded from bytes passed to [ZipUnpacker::update]
150    pub fn set_callback(&mut self, on_decode: impl Fn(ZipDecodedData) -> anyhow::Result<()> + 'a) {
151        self.on_decode = Some(Box::new(on_decode));
152    }
153
154    /// Update this ZipUnpacker with new bytes. The callback may or
155    /// may not be fired, depending on the content. The callback may
156    /// be fired multiple times.
157    /// 
158    /// The first return value is how much the caller should advance the input buffer
159    /// (0 means that there wasn't enough data in the buffer and the caller should 
160    /// provide more), and the second value determines whether all files were processed 
161    /// (which means that the caller should stop providing data)
162    pub fn update(&mut self, data: impl AsRef<[u8]>) -> Result<(usize, bool), DecoderError> {
163        let data = data.as_ref();
164
165        let mut buf_offset = 0;
166        loop {
167            let (advanced, reached_end) = self.update_internal(&data[buf_offset..])?;
168            buf_offset += advanced;
169
170            self.current_position.offset += advanced;
171            if self.current_position.offset >= self.disk_sizes[self.current_position.disk] {
172                // Find which disk this offset will be at
173                let mut new_offset = self.current_position.offset;
174                let mut new_disk_number = None;
175                for d in (self.current_position.disk)..(self.disk_sizes.len() - 1) {
176                    new_offset -= self.disk_sizes[d];
177                    if new_offset < self.disk_sizes[d + 1] {
178                        new_disk_number = Some(d + 1);
179                        break;
180                    }
181                }
182
183                let Some(new_disk_number) = new_disk_number else {
184                    return Err(DecoderError::ExtraData);
185                };
186
187                self.current_position.offset = new_offset;
188                self.current_position.disk = new_disk_number;
189            }
190
191            if advanced == 0 || reached_end {
192                return Ok((buf_offset, reached_end));
193            }
194        }
195    }
196
197    fn update_internal(&mut self, data: impl AsRef<[u8]>) -> Result<(usize, bool), DecoderError> {
198        let headers = self.central_directory.headers_ref();
199        if self.current_index >= headers.len() {
200            return Ok((0, true));
201        }
202        let cdfh = &headers[self.current_index];
203
204        let data = data.as_ref();
205
206        match &mut self.decoder_state {
207            ZipDecoderState::FileHeader => {
208                if self.current_position > cdfh.header_position() {
209                    return Err(DecoderError::Overshoot(cdfh.header_position(), self.current_position));
210                }
211
212                if self.current_position.disk < cdfh.disk_number as usize {
213                    // Next disk
214                    return Ok((std::cmp::min(self.disk_sizes[self.current_position.disk] - self.current_position.offset, data.len()), false));
215                }
216
217                if self.current_position.offset < cdfh.local_header_offset as usize {
218                    return Ok((std::cmp::min(cdfh.local_header_offset as usize - self.current_position.offset, data.len()), false));
219                }
220
221                if data.len() < 4 + LFH_CONSTANT_SIZE {
222                    return Ok((0, false));
223                }
224
225                let signature = u32::from_le_bytes(data[..4].try_into().unwrap());
226                if signature != LFH_SIGNATURE {
227                    return Err(DecoderError::InvalidSignature);
228                }
229
230                let Some(lfh) = LocalFileHeader::from_bytes(&data[4..]) else {
231                    return Ok((0, false));
232                };
233                let header_size = lfh.header_size;
234
235                if let Some(on_decode) = &self.on_decode {
236                    (on_decode)(ZipDecodedData::FileHeader(cdfh, &lfh))?;
237                }
238
239                if lfh.uncompressed_size != 0 {
240                    let decompressor = lfh.compression_method
241                        .as_ref()
242                        .map(|m| m.create_decompressor())
243                        .transpose()?;
244
245                    self.decoder_state = ZipDecoderState::FileData(0, lfh, decompressor);
246                } else {
247                    self.decoder_state = ZipDecoderState::FileHeader;
248                    self.current_index += 1;
249                }
250
251                Ok((4 + header_size, false))
252            },
253
254            ZipDecoderState::FileData(pos, lfh, decompressor) => {
255                let bytes_left = lfh.compressed_size - *pos;
256                let bytes_to_read = std::cmp::min(bytes_left as usize, data.len());
257                let file_bytes = &data[..bytes_to_read];
258
259                let (count, decompressed) = if let Some(decompressor) = decompressor {
260                    decompressor.update(file_bytes)?
261                } else {
262                    (bytes_to_read, file_bytes)
263                };
264                *pos += count as u64;
265
266                if let Some(on_decode) = &self.on_decode {
267                    (on_decode)(ZipDecodedData::FileData(decompressed))?;
268                }
269
270                if count as u64 == bytes_left {
271                    self.decoder_state = ZipDecoderState::FileHeader;
272                    self.current_index += 1;
273                }
274
275                Ok((count, false))
276            }
277        }
278    }
279}