s_zip/
reader.rs

//! Streaming ZIP reader - extracts individual entries without loading the whole archive
//!
//! This is a minimal ZIP reader. Opening an archive parses the central directory into an
//! in-memory list of entries; entry data is read from disk only when it is requested.
5
6use crate::error::{Result, SZipError};
7use flate2::read::DeflateDecoder;
8use std::fs::File;
9use std::io::{BufReader, Read, Seek, SeekFrom};
10use std::path::Path;
11
/// Signature that opens every local file header (bytes `50 4b 03 04` on disk).
const LOCAL_FILE_HEADER_SIGNATURE: u32 = 0x0403_4b50;

/// Signature that opens every central directory file header (bytes `50 4b 01 02` on disk).
const CENTRAL_DIRECTORY_SIGNATURE: u32 = 0x0201_4b50;

/// Signature of the end of central directory record (bytes `50 4b 05 06` on disk).
const END_OF_CENTRAL_DIRECTORY_SIGNATURE: u32 = 0x0605_4b50;

/// Signature of the ZIP64 end of central directory record (bytes `50 4b 06 06` on disk).
const ZIP64_END_OF_CENTRAL_DIRECTORY_SIGNATURE: u32 = 0x0606_4b50;
23
// The ZIP64 end of central directory locator signature (bytes 50 4b 06 07) is matched
// byte-by-byte in `read_zip64_eocd`, so no u32 constant is defined for it.
25
/// Metadata for a single entry, as recorded in the ZIP central directory.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ZipEntry {
    /// Entry path inside the archive (bytes that are not valid UTF-8 are replaced lossily).
    pub name: String,
    /// Number of bytes the entry's data occupies inside the archive.
    pub compressed_size: u64,
    /// Size recorded for the entry's data after decompression.
    pub uncompressed_size: u64,
    /// ZIP compression method ID; the reader handles 0 (stored) and 8 (DEFLATE).
    pub compression_method: u16,
    /// Byte offset of the entry's local file header, measured from the start of the file.
    pub offset: u64,
}
35
36/// Streaming ZIP archive reader
37pub struct StreamingZipReader {
38    file: BufReader<File>,
39    entries: Vec<ZipEntry>,
40}
41
42impl StreamingZipReader {
43    /// Open a ZIP file and read its central directory
44    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
45        let mut file = BufReader::new(File::open(path)?);
46
47        // Find and read central directory
48        let entries = Self::read_central_directory(&mut file)?;
49
50        Ok(StreamingZipReader { file, entries })
51    }
52
53    /// Get list of all entries in the ZIP
54    pub fn entries(&self) -> &[ZipEntry] {
55        &self.entries
56    }
57
58    /// Find an entry by name
59    pub fn find_entry(&self, name: &str) -> Option<&ZipEntry> {
60        self.entries.iter().find(|e| e.name == name)
61    }
62
63    /// Read an entry's decompressed data into a vector
64    pub fn read_entry(&mut self, entry: &ZipEntry) -> Result<Vec<u8>> {
65        // Seek to local file header
66        self.file.seek(SeekFrom::Start(entry.offset))?;
67
68        // Read and verify local file header
69        let signature = self.read_u32_le()?;
70        if signature != LOCAL_FILE_HEADER_SIGNATURE {
71            return Err(SZipError::InvalidFormat(
72                "Invalid local file header signature".to_string(),
73            ));
74        }
75
76        // Skip version, flags, compression method
77        self.file.seek(SeekFrom::Current(6))?;
78
79        // Skip modification time and date, CRC-32
80        self.file.seek(SeekFrom::Current(8))?;
81
82        // Read compressed and uncompressed sizes (already known from central directory)
83        self.file.seek(SeekFrom::Current(8))?;
84
85        // Read filename length and extra field length
86        let filename_len = self.read_u16_le()? as i64;
87        let extra_len = self.read_u16_le()? as i64;
88
89        // Skip filename and extra field
90        self.file
91            .seek(SeekFrom::Current(filename_len + extra_len))?;
92
93        // Now read the compressed data
94        let mut compressed_data = vec![0u8; entry.compressed_size as usize];
95        self.file.read_exact(&mut compressed_data)?;
96
97        // Decompress if needed
98        let data = if entry.compression_method == 8 {
99            // DEFLATE compression
100            let mut decoder = DeflateDecoder::new(&compressed_data[..]);
101            let mut decompressed = Vec::new();
102            decoder.read_to_end(&mut decompressed)?;
103            decompressed
104        } else if entry.compression_method == 0 {
105            // No compression (stored)
106            compressed_data
107        } else {
108            return Err(SZipError::UnsupportedCompression(entry.compression_method));
109        };
110
111        Ok(data)
112    }
113
114    /// Read an entry by name
115    pub fn read_entry_by_name(&mut self, name: &str) -> Result<Vec<u8>> {
116        let entry = self
117            .find_entry(name)
118            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
119            .clone();
120
121        self.read_entry(&entry)
122    }
123
124    /// Get a streaming reader for an entry by name (for large files)
125    /// Returns a reader that decompresses data on-the-fly without loading everything into memory
126    pub fn read_entry_streaming_by_name(&mut self, name: &str) -> Result<Box<dyn Read + '_>> {
127        let entry = self
128            .find_entry(name)
129            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
130            .clone();
131
132        self.read_entry_streaming(&entry)
133    }
134
135    /// Get a streaming reader for an entry (for large files)
136    /// Returns a reader that decompresses data on-the-fly without loading everything into memory
137    pub fn read_entry_streaming(&mut self, entry: &ZipEntry) -> Result<Box<dyn Read + '_>> {
138        // Seek to local file header
139        self.file.seek(SeekFrom::Start(entry.offset))?;
140
141        // Read and verify local file header
142        let signature = self.read_u32_le()?;
143        if signature != LOCAL_FILE_HEADER_SIGNATURE {
144            return Err(SZipError::InvalidFormat(
145                "Invalid local file header signature".to_string(),
146            ));
147        }
148
149        // Skip version, flags, compression method
150        self.file.seek(SeekFrom::Current(6))?;
151
152        // Skip modification time and date, CRC-32
153        self.file.seek(SeekFrom::Current(8))?;
154
155        // Read compressed and uncompressed sizes
156        self.file.seek(SeekFrom::Current(8))?;
157
158        // Read filename length and extra field length
159        let filename_len = self.read_u16_le()? as i64;
160        let extra_len = self.read_u16_le()? as i64;
161
162        // Skip filename and extra field
163        self.file
164            .seek(SeekFrom::Current(filename_len + extra_len))?;
165
166        // Create a reader limited to compressed data size
167        let limited_reader = (&mut self.file).take(entry.compressed_size);
168
169        // Wrap with decompressor if needed
170        if entry.compression_method == 8 {
171            // DEFLATE compression
172            Ok(Box::new(DeflateDecoder::new(limited_reader)))
173        } else if entry.compression_method == 0 {
174            // No compression (stored)
175            Ok(Box::new(limited_reader))
176        } else {
177            Err(SZipError::UnsupportedCompression(entry.compression_method))
178        }
179    }
180
181    /// Get a streaming reader for an entry by name
182    pub fn read_entry_by_name_streaming(&mut self, name: &str) -> Result<Box<dyn Read + '_>> {
183        let entry = self
184            .find_entry(name)
185            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
186            .clone();
187
188        self.read_entry_streaming(&entry)
189    }
190
191    /// Read the central directory from the ZIP file
192    fn read_central_directory(file: &mut BufReader<File>) -> Result<Vec<ZipEntry>> {
193        // Find end of central directory record
194        let eocd_offset = Self::find_eocd(file)?;
195
196        // Seek to EOCD
197        file.seek(SeekFrom::Start(eocd_offset))?;
198
199        // Read EOCD
200        let signature = Self::read_u32_le_static(file)?;
201        if signature != END_OF_CENTRAL_DIRECTORY_SIGNATURE {
202            return Err(SZipError::InvalidFormat(format!(
203                "Invalid end of central directory signature: 0x{:08x}",
204                signature
205            )));
206        }
207
208        // Skip disk number fields (4 bytes)
209        file.seek(SeekFrom::Current(4))?;
210
211        // Read number of entries on this disk (2 bytes)
212        let _entries_on_disk = Self::read_u16_le_static(file)?;
213
214        // Read total number of entries (2 bytes)
215
216        // These values may be placeholder 0xFFFF/0xFFFFFFFF when ZIP64 is used
217        let total_entries_16 = Self::read_u16_le_static(file)?;
218
219        // Read central directory size (4 bytes)
220        let cd_size_32 = Self::read_u32_le_static(file)?;
221
222        // Read central directory offset (4 bytes)
223        let cd_offset_32 = Self::read_u32_le_static(file)? as u64;
224
225        // Promote to u64 and handle ZIP64 if markers present
226        let mut total_entries = total_entries_16 as usize;
227        let mut cd_offset = cd_offset_32;
228        let _cd_size = cd_size_32 as u64;
229
230        if total_entries_16 == 0xFFFF || cd_size_32 == 0xFFFFFFFF || cd_offset_32 == 0xFFFFFFFF {
231            // Need to find ZIP64 EOCD locator and read ZIP64 EOCD record
232            let (zip64_total_entries, zip64_cd_size, zip64_cd_offset) =
233                Self::read_zip64_eocd(file, eocd_offset)?;
234            total_entries = zip64_total_entries as usize;
235            cd_offset = zip64_cd_offset;
236            // _cd_size can be used if needed (zip64_cd_size)
237            let _ = zip64_cd_size;
238        }
239
240        // Seek to central directory
241        file.seek(SeekFrom::Start(cd_offset))?;
242
243        // Read all central directory entries
244        let mut entries = Vec::with_capacity(total_entries);
245        for _ in 0..total_entries {
246            let signature = Self::read_u32_le_static(file)?;
247            if signature != CENTRAL_DIRECTORY_SIGNATURE {
248                break;
249            }
250
251            // Skip version made by, version needed, flags
252            file.seek(SeekFrom::Current(6))?;
253
254            let compression_method = Self::read_u16_le_static(file)?;
255
256            // Skip modification time, date, CRC-32
257            file.seek(SeekFrom::Current(8))?;
258
259            // Read sizes as 32-bit placeholders (may be 0xFFFFFFFF meaning ZIP64)
260            let compressed_size_32 = Self::read_u32_le_static(file)? as u64;
261            let uncompressed_size_32 = Self::read_u32_le_static(file)? as u64;
262            let filename_len = Self::read_u16_le_static(file)? as usize;
263            let extra_len = Self::read_u16_le_static(file)? as usize;
264            let comment_len = Self::read_u16_le_static(file)? as usize;
265
266            // Skip disk number, internal attributes, external attributes
267            file.seek(SeekFrom::Current(8))?;
268
269            let mut offset = Self::read_u32_le_static(file)? as u64;
270
271            // Read filename
272            let mut filename_buf = vec![0u8; filename_len];
273            file.read_exact(&mut filename_buf)?;
274            let name = String::from_utf8_lossy(&filename_buf).to_string();
275
276            // Read extra field so we can parse ZIP64 extra if present
277            let mut extra_buf = vec![0u8; extra_len];
278            if extra_len > 0 {
279                file.read_exact(&mut extra_buf)?;
280            }
281
282            // If sizes/offsets are 0xFFFFFFFF, parse ZIP64 extra field (0x0001)
283            let mut compressed_size = compressed_size_32;
284            let mut uncompressed_size = uncompressed_size_32;
285
286            if compressed_size_32 == 0xFFFFFFFF
287                || uncompressed_size_32 == 0xFFFFFFFF
288                || offset == 0xFFFFFFFF
289            {
290                // parse extra fields
291                let mut i = 0usize;
292                while i + 4 <= extra_buf.len() {
293                    let id = u16::from_le_bytes([extra_buf[i], extra_buf[i + 1]]);
294                    let data_len =
295                        u16::from_le_bytes([extra_buf[i + 2], extra_buf[i + 3]]) as usize;
296                    i += 4;
297                    if i + data_len > extra_buf.len() {
298                        break;
299                    }
300                    if id == 0x0001 {
301                        // ZIP64 extra field: contains values in order: original size, compressed size, relative header offset, disk start
302                        let mut cursor = 0usize;
303                        // read uncompressed size if placeholder present
304                        if uncompressed_size_32 == 0xFFFFFFFF && cursor + 8 <= data_len {
305                            uncompressed_size = u64::from_le_bytes([
306                                extra_buf[i + cursor],
307                                extra_buf[i + cursor + 1],
308                                extra_buf[i + cursor + 2],
309                                extra_buf[i + cursor + 3],
310                                extra_buf[i + cursor + 4],
311                                extra_buf[i + cursor + 5],
312                                extra_buf[i + cursor + 6],
313                                extra_buf[i + cursor + 7],
314                            ]);
315                            cursor += 8;
316                        }
317                        // read compressed size if placeholder present
318                        if compressed_size_32 == 0xFFFFFFFF && cursor + 8 <= data_len {
319                            compressed_size = u64::from_le_bytes([
320                                extra_buf[i + cursor],
321                                extra_buf[i + cursor + 1],
322                                extra_buf[i + cursor + 2],
323                                extra_buf[i + cursor + 3],
324                                extra_buf[i + cursor + 4],
325                                extra_buf[i + cursor + 5],
326                                extra_buf[i + cursor + 6],
327                                extra_buf[i + cursor + 7],
328                            ]);
329                            cursor += 8;
330                        }
331                        // read offset if placeholder present
332                        if offset == 0xFFFFFFFF && cursor + 8 <= data_len {
333                            offset = u64::from_le_bytes([
334                                extra_buf[i + cursor],
335                                extra_buf[i + cursor + 1],
336                                extra_buf[i + cursor + 2],
337                                extra_buf[i + cursor + 3],
338                                extra_buf[i + cursor + 4],
339                                extra_buf[i + cursor + 5],
340                                extra_buf[i + cursor + 6],
341                                extra_buf[i + cursor + 7],
342                            ]);
343                        }
344                        // we don't need disk start here
345                        break;
346                    }
347                    i += data_len;
348                }
349            }
350
351            // Skip comment
352            if comment_len > 0 {
353                file.seek(SeekFrom::Current(comment_len as i64))?;
354            }
355
356            entries.push(ZipEntry {
357                name,
358                compressed_size,
359                uncompressed_size,
360                compression_method,
361                offset,
362            });
363        }
364
365        Ok(entries)
366    }
367
368    /// When EOCD indicates ZIP64 usage, find and read ZIP64 EOCD locator and record
369    fn read_zip64_eocd(file: &mut BufReader<File>, eocd_offset: u64) -> Result<(u64, u64, u64)> {
370        // Search backwards from EOCD for ZIP64 EOCD locator signature (50 4b 06 07)
371        let search_start = eocd_offset.saturating_sub(65557);
372        file.seek(SeekFrom::Start(search_start))?;
373        let mut buffer = Vec::new();
374        file.read_to_end(&mut buffer)?;
375
376        let mut locator_pos: Option<usize> = None;
377        for i in (0..buffer.len().saturating_sub(3)).rev() {
378            if buffer[i] == 0x50
379                && buffer[i + 1] == 0x4b
380                && buffer[i + 2] == 0x06
381                && buffer[i + 3] == 0x07
382            {
383                locator_pos = Some(i);
384                break;
385            }
386        }
387
388        let locator_pos = locator_pos
389            .ok_or_else(|| SZipError::InvalidFormat("ZIP64 EOCD locator not found".to_string()))?;
390
391        // Read locator fields from buffer
392        // locator layout: signature(4), number of the disk with the start of the zip64 eocd(4), relative offset of the zip64 eocd(8), total number of disks(4)
393        let rel_off_bytes = &buffer[locator_pos + 8..locator_pos + 16];
394        let zip64_eocd_offset = u64::from_le_bytes([
395            rel_off_bytes[0],
396            rel_off_bytes[1],
397            rel_off_bytes[2],
398            rel_off_bytes[3],
399            rel_off_bytes[4],
400            rel_off_bytes[5],
401            rel_off_bytes[6],
402            rel_off_bytes[7],
403        ]);
404
405        // Seek to ZIP64 EOCD record
406        file.seek(SeekFrom::Start(zip64_eocd_offset))?;
407
408        let sig = Self::read_u32_le_static(file)?;
409        if sig != ZIP64_END_OF_CENTRAL_DIRECTORY_SIGNATURE {
410            return Err(SZipError::InvalidFormat(format!(
411                "Invalid ZIP64 EOCD signature: 0x{:08x}",
412                sig
413            )));
414        }
415
416        // size of ZIP64 EOCD record (8 bytes)
417        let _size = {
418            let mut buf = [0u8; 8];
419            file.read_exact(&mut buf)?;
420            u64::from_le_bytes(buf)
421        };
422
423        // skip version made by (2), version needed (2), disk number (4), disk where central dir starts (4)
424        file.seek(SeekFrom::Current(12))?;
425
426        // total number of entries on this disk (8)
427        let total_entries = {
428            let mut buf = [0u8; 8];
429            file.read_exact(&mut buf)?;
430            u64::from_le_bytes(buf)
431        };
432
433        // total number of entries (8) - some implementations write both; ignore the second value
434        {
435            let mut buf = [0u8; 8];
436            file.read_exact(&mut buf)?;
437            // ignore u64::from_le_bytes(buf)
438        }
439
440        // central directory size (8)
441        let cd_size = {
442            let mut buf = [0u8; 8];
443            file.read_exact(&mut buf)?;
444            u64::from_le_bytes(buf)
445        };
446
447        // central directory offset (8)
448        let cd_offset = {
449            let mut buf = [0u8; 8];
450            file.read_exact(&mut buf)?;
451            u64::from_le_bytes(buf)
452        };
453
454        Ok((total_entries, cd_size, cd_offset))
455    }
456
457    /// Find the end of central directory record by scanning from the end of the file
458    fn find_eocd(file: &mut BufReader<File>) -> Result<u64> {
459        let file_size = file.seek(SeekFrom::End(0))?;
460
461        // EOCD is at least 22 bytes, search last 65KB (max comment size + EOCD)
462        let search_start = file_size.saturating_sub(65557);
463        file.seek(SeekFrom::Start(search_start))?;
464
465        let mut buffer = Vec::new();
466        file.read_to_end(&mut buffer)?;
467
468        // Search for EOCD signature from the end
469        for i in (0..buffer.len().saturating_sub(3)).rev() {
470            if buffer[i] == 0x50
471                && buffer[i + 1] == 0x4b
472                && buffer[i + 2] == 0x05
473                && buffer[i + 3] == 0x06
474            {
475                return Ok(search_start + i as u64);
476            }
477        }
478
479        Err(SZipError::InvalidFormat(
480            "End of central directory not found".to_string(),
481        ))
482    }
483
484    fn read_u16_le(&mut self) -> Result<u16> {
485        let mut buf = [0u8; 2];
486        self.file.read_exact(&mut buf)?;
487        Ok(u16::from_le_bytes(buf))
488    }
489
490    fn read_u32_le(&mut self) -> Result<u32> {
491        let mut buf = [0u8; 4];
492        self.file.read_exact(&mut buf)?;
493        Ok(u32::from_le_bytes(buf))
494    }
495
496    fn read_u16_le_static(file: &mut BufReader<File>) -> Result<u16> {
497        let mut buf = [0u8; 2];
498        file.read_exact(&mut buf)?;
499        Ok(u16::from_le_bytes(buf))
500    }
501
502    fn read_u32_le_static(file: &mut BufReader<File>) -> Result<u32> {
503        let mut buf = [0u8; 4];
504        file.read_exact(&mut buf)?;
505        Ok(u32::from_le_bytes(buf))
506    }
507}