s_zip/
reader.rs

//! Streaming ZIP reader - extracts individual entries without loading the whole archive
//!
//! This is a minimal ZIP reader. Opening an archive parses its central directory
//! into an in-memory list of entries; the data of each entry is then read, or
//! streamed, on demand.
5
6use crate::error::{Result, SZipError};
7use flate2::read::DeflateDecoder;
8use std::fs::File;
9use std::io::{BufReader, Read, Seek, SeekFrom};
10use std::path::Path;
11
/// Signature that starts every local file header (`PK\x03\x04` on disk).
const LOCAL_FILE_HEADER_SIGNATURE: u32 = 0x0403_4b50;

/// Signature that starts every central directory file header (`PK\x01\x02` on disk).
const CENTRAL_DIRECTORY_SIGNATURE: u32 = 0x0201_4b50;

/// Signature that starts the end of central directory record (`PK\x05\x06` on disk).
const END_OF_CENTRAL_DIRECTORY_SIGNATURE: u32 = 0x0605_4b50;

/// Signature that starts the ZIP64 end of central directory record (`PK\x06\x06` on disk).
const ZIP64_END_OF_CENTRAL_DIRECTORY_SIGNATURE: u32 = 0x0606_4b50;

// The ZIP64 end of central directory locator signature is matched byte-wise
// where it is needed, so it has no `u32` constant here.
25
/// One file record taken from the ZIP central directory.
#[derive(Debug, Clone)]
pub struct ZipEntry {
    /// Entry name as stored in the archive (decoded lossily as UTF-8).
    pub name: String,
    /// Number of bytes the entry's data occupies inside the archive.
    pub compressed_size: u64,
    /// Size of the entry's data as recorded in the archive for the decoded form.
    pub uncompressed_size: u64,
    /// Raw ZIP compression method identifier (the reader handles 0, 8 and 93).
    pub compression_method: u16,
    /// Absolute file offset of the entry's local file header.
    pub offset: u64,
}
35
36/// Streaming ZIP archive reader
37pub struct StreamingZipReader {
38    file: BufReader<File>,
39    entries: Vec<ZipEntry>,
40}
41
42impl StreamingZipReader {
43    /// Open a ZIP file and read its central directory
44    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
45        let mut file = BufReader::new(File::open(path)?);
46
47        // Find and read central directory
48        let entries = Self::read_central_directory(&mut file)?;
49
50        Ok(StreamingZipReader { file, entries })
51    }
52
53    /// Get list of all entries in the ZIP
54    pub fn entries(&self) -> &[ZipEntry] {
55        &self.entries
56    }
57
58    /// Find an entry by name
59    pub fn find_entry(&self, name: &str) -> Option<&ZipEntry> {
60        self.entries.iter().find(|e| e.name == name)
61    }
62
63    /// Read an entry's decompressed data into a vector
64    pub fn read_entry(&mut self, entry: &ZipEntry) -> Result<Vec<u8>> {
65        // Seek to local file header
66        self.file.seek(SeekFrom::Start(entry.offset))?;
67
68        // Read and verify local file header
69        let signature = self.read_u32_le()?;
70        if signature != LOCAL_FILE_HEADER_SIGNATURE {
71            return Err(SZipError::InvalidFormat(
72                "Invalid local file header signature".to_string(),
73            ));
74        }
75
76        // Skip version, flags, compression method
77        self.file.seek(SeekFrom::Current(6))?;
78
79        // Skip modification time and date, CRC-32
80        self.file.seek(SeekFrom::Current(8))?;
81
82        // Read compressed and uncompressed sizes (already known from central directory)
83        self.file.seek(SeekFrom::Current(8))?;
84
85        // Read filename length and extra field length
86        let filename_len = self.read_u16_le()? as i64;
87        let extra_len = self.read_u16_le()? as i64;
88
89        // Skip filename and extra field
90        self.file
91            .seek(SeekFrom::Current(filename_len + extra_len))?;
92
93        // Now read the compressed data
94        let mut compressed_data = vec![0u8; entry.compressed_size as usize];
95        self.file.read_exact(&mut compressed_data)?;
96
97        // Decompress if needed
98        let data = if entry.compression_method == 8 {
99            // DEFLATE compression
100            let mut decoder = DeflateDecoder::new(&compressed_data[..]);
101            let mut decompressed = Vec::new();
102            decoder.read_to_end(&mut decompressed)?;
103            decompressed
104        } else if entry.compression_method == 0 {
105            // No compression (stored)
106            compressed_data
107        } else if entry.compression_method == 93 {
108            // Zstd compression
109            #[cfg(feature = "zstd-support")]
110            {
111                zstd::decode_all(&compressed_data[..])?
112            }
113            #[cfg(not(feature = "zstd-support"))]
114            {
115                return Err(SZipError::UnsupportedCompression(entry.compression_method));
116            }
117        } else {
118            return Err(SZipError::UnsupportedCompression(entry.compression_method));
119        };
120
121        Ok(data)
122    }
123
124    /// Read an entry by name
125    pub fn read_entry_by_name(&mut self, name: &str) -> Result<Vec<u8>> {
126        let entry = self
127            .find_entry(name)
128            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
129            .clone();
130
131        self.read_entry(&entry)
132    }
133
134    /// Get a streaming reader for an entry by name (for large files)
135    /// Returns a reader that decompresses data on-the-fly without loading everything into memory
136    pub fn read_entry_streaming_by_name(&mut self, name: &str) -> Result<Box<dyn Read + '_>> {
137        let entry = self
138            .find_entry(name)
139            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
140            .clone();
141
142        self.read_entry_streaming(&entry)
143    }
144
145    /// Get a streaming reader for an entry (for large files)
146    /// Returns a reader that decompresses data on-the-fly without loading everything into memory
147    pub fn read_entry_streaming(&mut self, entry: &ZipEntry) -> Result<Box<dyn Read + '_>> {
148        // Seek to local file header
149        self.file.seek(SeekFrom::Start(entry.offset))?;
150
151        // Read and verify local file header
152        let signature = self.read_u32_le()?;
153        if signature != LOCAL_FILE_HEADER_SIGNATURE {
154            return Err(SZipError::InvalidFormat(
155                "Invalid local file header signature".to_string(),
156            ));
157        }
158
159        // Skip version, flags, compression method
160        self.file.seek(SeekFrom::Current(6))?;
161
162        // Skip modification time and date, CRC-32
163        self.file.seek(SeekFrom::Current(8))?;
164
165        // Read compressed and uncompressed sizes
166        self.file.seek(SeekFrom::Current(8))?;
167
168        // Read filename length and extra field length
169        let filename_len = self.read_u16_le()? as i64;
170        let extra_len = self.read_u16_le()? as i64;
171
172        // Skip filename and extra field
173        self.file
174            .seek(SeekFrom::Current(filename_len + extra_len))?;
175
176        // Create a reader limited to compressed data size
177        let limited_reader = (&mut self.file).take(entry.compressed_size);
178
179        // Wrap with decompressor if needed
180        if entry.compression_method == 8 {
181            // DEFLATE compression
182            Ok(Box::new(DeflateDecoder::new(limited_reader)))
183        } else if entry.compression_method == 0 {
184            // No compression (stored)
185            Ok(Box::new(limited_reader))
186        } else if entry.compression_method == 93 {
187            // Zstd compression
188            #[cfg(feature = "zstd-support")]
189            {
190                Ok(Box::new(zstd::Decoder::new(limited_reader)?))
191            }
192            #[cfg(not(feature = "zstd-support"))]
193            {
194                Err(SZipError::UnsupportedCompression(entry.compression_method))
195            }
196        } else {
197            Err(SZipError::UnsupportedCompression(entry.compression_method))
198        }
199    }
200
201    /// Get a streaming reader for an entry by name
202    pub fn read_entry_by_name_streaming(&mut self, name: &str) -> Result<Box<dyn Read + '_>> {
203        let entry = self
204            .find_entry(name)
205            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
206            .clone();
207
208        self.read_entry_streaming(&entry)
209    }
210
211    /// Read the central directory from the ZIP file
212    fn read_central_directory(file: &mut BufReader<File>) -> Result<Vec<ZipEntry>> {
213        // Find end of central directory record
214        let eocd_offset = Self::find_eocd(file)?;
215
216        // Seek to EOCD
217        file.seek(SeekFrom::Start(eocd_offset))?;
218
219        // Read EOCD
220        let signature = Self::read_u32_le_static(file)?;
221        if signature != END_OF_CENTRAL_DIRECTORY_SIGNATURE {
222            return Err(SZipError::InvalidFormat(format!(
223                "Invalid end of central directory signature: 0x{:08x}",
224                signature
225            )));
226        }
227
228        // Skip disk number fields (4 bytes)
229        file.seek(SeekFrom::Current(4))?;
230
231        // Read number of entries on this disk (2 bytes)
232        let _entries_on_disk = Self::read_u16_le_static(file)?;
233
234        // Read total number of entries (2 bytes)
235
236        // These values may be placeholder 0xFFFF/0xFFFFFFFF when ZIP64 is used
237        let total_entries_16 = Self::read_u16_le_static(file)?;
238
239        // Read central directory size (4 bytes)
240        let cd_size_32 = Self::read_u32_le_static(file)?;
241
242        // Read central directory offset (4 bytes)
243        let cd_offset_32 = Self::read_u32_le_static(file)? as u64;
244
245        // Promote to u64 and handle ZIP64 if markers present
246        let mut total_entries = total_entries_16 as usize;
247        let mut cd_offset = cd_offset_32;
248        let _cd_size = cd_size_32 as u64;
249
250        if total_entries_16 == 0xFFFF || cd_size_32 == 0xFFFFFFFF || cd_offset_32 == 0xFFFFFFFF {
251            // Need to find ZIP64 EOCD locator and read ZIP64 EOCD record
252            let (zip64_total_entries, zip64_cd_size, zip64_cd_offset) =
253                Self::read_zip64_eocd(file, eocd_offset)?;
254            total_entries = zip64_total_entries as usize;
255            cd_offset = zip64_cd_offset;
256            // _cd_size can be used if needed (zip64_cd_size)
257            let _ = zip64_cd_size;
258        }
259
260        // Seek to central directory
261        file.seek(SeekFrom::Start(cd_offset))?;
262
263        // Read all central directory entries
264        let mut entries = Vec::with_capacity(total_entries);
265        for _ in 0..total_entries {
266            let signature = Self::read_u32_le_static(file)?;
267            if signature != CENTRAL_DIRECTORY_SIGNATURE {
268                break;
269            }
270
271            // Skip version made by, version needed, flags
272            file.seek(SeekFrom::Current(6))?;
273
274            let compression_method = Self::read_u16_le_static(file)?;
275
276            // Skip modification time, date, CRC-32
277            file.seek(SeekFrom::Current(8))?;
278
279            // Read sizes as 32-bit placeholders (may be 0xFFFFFFFF meaning ZIP64)
280            let compressed_size_32 = Self::read_u32_le_static(file)? as u64;
281            let uncompressed_size_32 = Self::read_u32_le_static(file)? as u64;
282            let filename_len = Self::read_u16_le_static(file)? as usize;
283            let extra_len = Self::read_u16_le_static(file)? as usize;
284            let comment_len = Self::read_u16_le_static(file)? as usize;
285
286            // Skip disk number, internal attributes, external attributes
287            file.seek(SeekFrom::Current(8))?;
288
289            let mut offset = Self::read_u32_le_static(file)? as u64;
290
291            // Read filename
292            let mut filename_buf = vec![0u8; filename_len];
293            file.read_exact(&mut filename_buf)?;
294            let name = String::from_utf8_lossy(&filename_buf).to_string();
295
296            // Read extra field so we can parse ZIP64 extra if present
297            let mut extra_buf = vec![0u8; extra_len];
298            if extra_len > 0 {
299                file.read_exact(&mut extra_buf)?;
300            }
301
302            // If sizes/offsets are 0xFFFFFFFF, parse ZIP64 extra field (0x0001)
303            let mut compressed_size = compressed_size_32;
304            let mut uncompressed_size = uncompressed_size_32;
305
306            if compressed_size_32 == 0xFFFFFFFF
307                || uncompressed_size_32 == 0xFFFFFFFF
308                || offset == 0xFFFFFFFF
309            {
310                // parse extra fields
311                let mut i = 0usize;
312                while i + 4 <= extra_buf.len() {
313                    let id = u16::from_le_bytes([extra_buf[i], extra_buf[i + 1]]);
314                    let data_len =
315                        u16::from_le_bytes([extra_buf[i + 2], extra_buf[i + 3]]) as usize;
316                    i += 4;
317                    if i + data_len > extra_buf.len() {
318                        break;
319                    }
320                    if id == 0x0001 {
321                        // ZIP64 extra field: contains values in order: original size, compressed size, relative header offset, disk start
322                        let mut cursor = 0usize;
323                        // read uncompressed size if placeholder present
324                        if uncompressed_size_32 == 0xFFFFFFFF && cursor + 8 <= data_len {
325                            uncompressed_size = u64::from_le_bytes([
326                                extra_buf[i + cursor],
327                                extra_buf[i + cursor + 1],
328                                extra_buf[i + cursor + 2],
329                                extra_buf[i + cursor + 3],
330                                extra_buf[i + cursor + 4],
331                                extra_buf[i + cursor + 5],
332                                extra_buf[i + cursor + 6],
333                                extra_buf[i + cursor + 7],
334                            ]);
335                            cursor += 8;
336                        }
337                        // read compressed size if placeholder present
338                        if compressed_size_32 == 0xFFFFFFFF && cursor + 8 <= data_len {
339                            compressed_size = u64::from_le_bytes([
340                                extra_buf[i + cursor],
341                                extra_buf[i + cursor + 1],
342                                extra_buf[i + cursor + 2],
343                                extra_buf[i + cursor + 3],
344                                extra_buf[i + cursor + 4],
345                                extra_buf[i + cursor + 5],
346                                extra_buf[i + cursor + 6],
347                                extra_buf[i + cursor + 7],
348                            ]);
349                            cursor += 8;
350                        }
351                        // read offset if placeholder present
352                        if offset == 0xFFFFFFFF && cursor + 8 <= data_len {
353                            offset = u64::from_le_bytes([
354                                extra_buf[i + cursor],
355                                extra_buf[i + cursor + 1],
356                                extra_buf[i + cursor + 2],
357                                extra_buf[i + cursor + 3],
358                                extra_buf[i + cursor + 4],
359                                extra_buf[i + cursor + 5],
360                                extra_buf[i + cursor + 6],
361                                extra_buf[i + cursor + 7],
362                            ]);
363                        }
364                        // we don't need disk start here
365                        break;
366                    }
367                    i += data_len;
368                }
369            }
370
371            // Skip comment
372            if comment_len > 0 {
373                file.seek(SeekFrom::Current(comment_len as i64))?;
374            }
375
376            entries.push(ZipEntry {
377                name,
378                compressed_size,
379                uncompressed_size,
380                compression_method,
381                offset,
382            });
383        }
384
385        Ok(entries)
386    }
387
388    /// When EOCD indicates ZIP64 usage, find and read ZIP64 EOCD locator and record
389    fn read_zip64_eocd(file: &mut BufReader<File>, eocd_offset: u64) -> Result<(u64, u64, u64)> {
390        // Search backwards from EOCD for ZIP64 EOCD locator signature (50 4b 06 07)
391        let search_start = eocd_offset.saturating_sub(65557);
392        file.seek(SeekFrom::Start(search_start))?;
393        let mut buffer = Vec::new();
394        file.read_to_end(&mut buffer)?;
395
396        let mut locator_pos: Option<usize> = None;
397        for i in (0..buffer.len().saturating_sub(3)).rev() {
398            if buffer[i] == 0x50
399                && buffer[i + 1] == 0x4b
400                && buffer[i + 2] == 0x06
401                && buffer[i + 3] == 0x07
402            {
403                locator_pos = Some(i);
404                break;
405            }
406        }
407
408        let locator_pos = locator_pos
409            .ok_or_else(|| SZipError::InvalidFormat("ZIP64 EOCD locator not found".to_string()))?;
410
411        // Read locator fields from buffer
412        // locator layout: signature(4), number of the disk with the start of the zip64 eocd(4), relative offset of the zip64 eocd(8), total number of disks(4)
413        let rel_off_bytes = &buffer[locator_pos + 8..locator_pos + 16];
414        let zip64_eocd_offset = u64::from_le_bytes([
415            rel_off_bytes[0],
416            rel_off_bytes[1],
417            rel_off_bytes[2],
418            rel_off_bytes[3],
419            rel_off_bytes[4],
420            rel_off_bytes[5],
421            rel_off_bytes[6],
422            rel_off_bytes[7],
423        ]);
424
425        // Seek to ZIP64 EOCD record
426        file.seek(SeekFrom::Start(zip64_eocd_offset))?;
427
428        let sig = Self::read_u32_le_static(file)?;
429        if sig != ZIP64_END_OF_CENTRAL_DIRECTORY_SIGNATURE {
430            return Err(SZipError::InvalidFormat(format!(
431                "Invalid ZIP64 EOCD signature: 0x{:08x}",
432                sig
433            )));
434        }
435
436        // size of ZIP64 EOCD record (8 bytes)
437        let _size = {
438            let mut buf = [0u8; 8];
439            file.read_exact(&mut buf)?;
440            u64::from_le_bytes(buf)
441        };
442
443        // skip version made by (2), version needed (2), disk number (4), disk where central dir starts (4)
444        file.seek(SeekFrom::Current(12))?;
445
446        // total number of entries on this disk (8)
447        let total_entries = {
448            let mut buf = [0u8; 8];
449            file.read_exact(&mut buf)?;
450            u64::from_le_bytes(buf)
451        };
452
453        // total number of entries (8) - some implementations write both; ignore the second value
454        {
455            let mut buf = [0u8; 8];
456            file.read_exact(&mut buf)?;
457            // ignore u64::from_le_bytes(buf)
458        }
459
460        // central directory size (8)
461        let cd_size = {
462            let mut buf = [0u8; 8];
463            file.read_exact(&mut buf)?;
464            u64::from_le_bytes(buf)
465        };
466
467        // central directory offset (8)
468        let cd_offset = {
469            let mut buf = [0u8; 8];
470            file.read_exact(&mut buf)?;
471            u64::from_le_bytes(buf)
472        };
473
474        Ok((total_entries, cd_size, cd_offset))
475    }
476
477    /// Find the end of central directory record by scanning from the end of the file
478    fn find_eocd(file: &mut BufReader<File>) -> Result<u64> {
479        let file_size = file.seek(SeekFrom::End(0))?;
480
481        // EOCD is at least 22 bytes, search last 65KB (max comment size + EOCD)
482        let search_start = file_size.saturating_sub(65557);
483        file.seek(SeekFrom::Start(search_start))?;
484
485        let mut buffer = Vec::new();
486        file.read_to_end(&mut buffer)?;
487
488        // Search for EOCD signature from the end
489        for i in (0..buffer.len().saturating_sub(3)).rev() {
490            if buffer[i] == 0x50
491                && buffer[i + 1] == 0x4b
492                && buffer[i + 2] == 0x05
493                && buffer[i + 3] == 0x06
494            {
495                return Ok(search_start + i as u64);
496            }
497        }
498
499        Err(SZipError::InvalidFormat(
500            "End of central directory not found".to_string(),
501        ))
502    }
503
504    fn read_u16_le(&mut self) -> Result<u16> {
505        let mut buf = [0u8; 2];
506        self.file.read_exact(&mut buf)?;
507        Ok(u16::from_le_bytes(buf))
508    }
509
510    fn read_u32_le(&mut self) -> Result<u32> {
511        let mut buf = [0u8; 4];
512        self.file.read_exact(&mut buf)?;
513        Ok(u32::from_le_bytes(buf))
514    }
515
516    fn read_u16_le_static(file: &mut BufReader<File>) -> Result<u16> {
517        let mut buf = [0u8; 2];
518        file.read_exact(&mut buf)?;
519        Ok(u16::from_le_bytes(buf))
520    }
521
522    fn read_u32_le_static(file: &mut BufReader<File>) -> Result<u32> {
523        let mut buf = [0u8; 4];
524        file.read_exact(&mut buf)?;
525        Ok(u32::from_le_bytes(buf))
526    }
527}