Skip to main content

s_zip/
reader.rs

//! ZIP archive reader that decompresses individual entries on demand.
//!
//! This is a minimal ZIP reader. Opening an archive parses its central directory
//! into an in-memory list of entry metadata; the contents of an entry are read
//! from disk only when requested, either fully into memory or through a
//! streaming reader.
5
6use crate::error::{Result, SZipError};
7use flate2::read::DeflateDecoder;
8use std::fs::File;
9use std::io::{BufReader, Read, Seek, SeekFrom};
10use std::path::Path;
11
/// Signature of a local file header (bytes `PK\x03\x04` read as a little-endian `u32`).
const LOCAL_FILE_HEADER_SIGNATURE: u32 = 0x0403_4b50;

/// Signature of a central directory file header (bytes `PK\x01\x02`).
const CENTRAL_DIRECTORY_SIGNATURE: u32 = 0x0201_4b50;

/// Signature of the end of central directory record (bytes `PK\x05\x06`).
const END_OF_CENTRAL_DIRECTORY_SIGNATURE: u32 = 0x0605_4b50;

/// Signature of the ZIP64 end of central directory record (bytes `PK\x06\x06`).
const ZIP64_END_OF_CENTRAL_DIRECTORY_SIGNATURE: u32 = 0x0606_4b50;

// The ZIP64 end of central directory locator signature (`PK\x06\x07`) has no
// `u32` constant here: it is matched byte-by-byte where the locator is searched.
25
/// Metadata for one file recorded in the ZIP central directory.
///
/// Values come from the central directory header; 32-bit placeholder values are
/// replaced by the 64-bit values from the ZIP64 extra field when one is present.
#[derive(Clone, Debug)]
pub struct ZipEntry {
    /// Entry name as stored in the archive (decoded lossily as UTF-8).
    pub name: String,
    /// Number of bytes the entry's data occupies inside the archive.
    pub compressed_size: u64,
    /// Size of the entry's data as recorded in the archive for its uncompressed form.
    pub uncompressed_size: u64,
    /// ZIP compression method id (the reader handles 0 = stored, 8 = DEFLATE, 93 = Zstd).
    pub compression_method: u16,
    /// Byte offset of the entry's local file header from the start of the file.
    pub offset: u64,
}
35
36/// Streaming ZIP archive reader with adaptive buffering
37pub struct StreamingZipReader {
38    file: BufReader<File>,
39    entries: Vec<ZipEntry>,
40}
41
42impl StreamingZipReader {
43    /// Open a ZIP file and read its central directory with default buffer size
44    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
45        Self::open_with_buffer_size(path, None)
46    }
47
48    /// Open a ZIP file with custom buffer size for optimized reading
49    ///
50    /// Providing a buffer size hint can improve read performance:
51    /// - Small ZIPs (<10MB): 32KB buffer
52    /// - Medium ZIPs (<100MB): 128KB buffer  
53    /// - Large ZIPs (≥100MB): 512KB buffer (default)
54    ///
55    /// # Example
56    /// ```no_run
57    /// # use s_zip::StreamingZipReader;
58    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
59    /// // Optimize for large ZIP files
60    /// let reader = StreamingZipReader::open_with_buffer_size(
61    ///     "large_archive.zip",
62    ///     Some(1024 * 1024) // 1MB buffer for very large files
63    /// )?;
64    /// # Ok(())
65    /// # }
66    /// ```
67    pub fn open_with_buffer_size<P: AsRef<Path>>(
68        path: P,
69        buffer_size: Option<usize>,
70    ) -> Result<Self> {
71        let file = File::open(path)?;
72
73        // Use adaptive buffer size
74        let buf_size = buffer_size.unwrap_or(512 * 1024); // Default 512KB
75        let mut file = BufReader::with_capacity(buf_size, file);
76
77        // Find and read central directory
78        let entries = Self::read_central_directory(&mut file)?;
79
80        Ok(StreamingZipReader { file, entries })
81    }
82
83    /// Get list of all entries in the ZIP
84    pub fn entries(&self) -> &[ZipEntry] {
85        &self.entries
86    }
87
88    /// Find an entry by name
89    pub fn find_entry(&self, name: &str) -> Option<&ZipEntry> {
90        self.entries.iter().find(|e| e.name == name)
91    }
92
93    /// Read an entry's decompressed data into a vector
94    pub fn read_entry(&mut self, entry: &ZipEntry) -> Result<Vec<u8>> {
95        // Seek to local file header
96        self.file.seek(SeekFrom::Start(entry.offset))?;
97
98        // Read and verify local file header
99        let signature = self.read_u32_le()?;
100        if signature != LOCAL_FILE_HEADER_SIGNATURE {
101            return Err(SZipError::InvalidFormat(
102                "Invalid local file header signature".to_string(),
103            ));
104        }
105
106        // Skip version, flags, compression method
107        self.file.seek(SeekFrom::Current(6))?;
108
109        // Skip modification time and date, CRC-32
110        self.file.seek(SeekFrom::Current(8))?;
111
112        // Read compressed and uncompressed sizes (already known from central directory)
113        self.file.seek(SeekFrom::Current(8))?;
114
115        // Read filename length and extra field length
116        let filename_len = self.read_u16_le()? as i64;
117        let extra_len = self.read_u16_le()? as i64;
118
119        // Skip filename and extra field
120        self.file
121            .seek(SeekFrom::Current(filename_len + extra_len))?;
122
123        // Now read the compressed data
124        let mut compressed_data = vec![0u8; entry.compressed_size as usize];
125        self.file.read_exact(&mut compressed_data)?;
126
127        // Decompress if needed
128        let data = if entry.compression_method == 8 {
129            // DEFLATE compression
130            let mut decoder = DeflateDecoder::new(&compressed_data[..]);
131            let mut decompressed = Vec::new();
132            decoder.read_to_end(&mut decompressed)?;
133            decompressed
134        } else if entry.compression_method == 0 {
135            // No compression (stored)
136            compressed_data
137        } else if entry.compression_method == 93 {
138            // Zstd compression
139            #[cfg(feature = "zstd-support")]
140            {
141                zstd::decode_all(&compressed_data[..])?
142            }
143            #[cfg(not(feature = "zstd-support"))]
144            {
145                return Err(SZipError::UnsupportedCompression(entry.compression_method));
146            }
147        } else {
148            return Err(SZipError::UnsupportedCompression(entry.compression_method));
149        };
150
151        Ok(data)
152    }
153
154    /// Read an entry by name
155    pub fn read_entry_by_name(&mut self, name: &str) -> Result<Vec<u8>> {
156        let entry = self
157            .find_entry(name)
158            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
159            .clone();
160
161        self.read_entry(&entry)
162    }
163
164    /// Get a streaming reader for an entry by name (for large files)
165    /// Returns a reader that decompresses data on-the-fly without loading everything into memory
166    pub fn read_entry_streaming_by_name(&mut self, name: &str) -> Result<Box<dyn Read + '_>> {
167        let entry = self
168            .find_entry(name)
169            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
170            .clone();
171
172        self.read_entry_streaming(&entry)
173    }
174
175    /// Get a streaming reader for an entry (for large files)
176    /// Returns a reader that decompresses data on-the-fly without loading everything into memory
177    pub fn read_entry_streaming(&mut self, entry: &ZipEntry) -> Result<Box<dyn Read + '_>> {
178        // Seek to local file header
179        self.file.seek(SeekFrom::Start(entry.offset))?;
180
181        // Read and verify local file header
182        let signature = self.read_u32_le()?;
183        if signature != LOCAL_FILE_HEADER_SIGNATURE {
184            return Err(SZipError::InvalidFormat(
185                "Invalid local file header signature".to_string(),
186            ));
187        }
188
189        // Skip version, flags, compression method
190        self.file.seek(SeekFrom::Current(6))?;
191
192        // Skip modification time and date, CRC-32
193        self.file.seek(SeekFrom::Current(8))?;
194
195        // Read compressed and uncompressed sizes
196        self.file.seek(SeekFrom::Current(8))?;
197
198        // Read filename length and extra field length
199        let filename_len = self.read_u16_le()? as i64;
200        let extra_len = self.read_u16_le()? as i64;
201
202        // Skip filename and extra field
203        self.file
204            .seek(SeekFrom::Current(filename_len + extra_len))?;
205
206        // Create a reader limited to compressed data size
207        let limited_reader = (&mut self.file).take(entry.compressed_size);
208
209        // Wrap with decompressor if needed
210        if entry.compression_method == 8 {
211            // DEFLATE compression
212            Ok(Box::new(DeflateDecoder::new(limited_reader)))
213        } else if entry.compression_method == 0 {
214            // No compression (stored)
215            Ok(Box::new(limited_reader))
216        } else if entry.compression_method == 93 {
217            // Zstd compression
218            #[cfg(feature = "zstd-support")]
219            {
220                Ok(Box::new(zstd::Decoder::new(limited_reader)?))
221            }
222            #[cfg(not(feature = "zstd-support"))]
223            {
224                Err(SZipError::UnsupportedCompression(entry.compression_method))
225            }
226        } else {
227            Err(SZipError::UnsupportedCompression(entry.compression_method))
228        }
229    }
230
231    /// Get a streaming reader for an entry by name
232    pub fn read_entry_by_name_streaming(&mut self, name: &str) -> Result<Box<dyn Read + '_>> {
233        let entry = self
234            .find_entry(name)
235            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
236            .clone();
237
238        self.read_entry_streaming(&entry)
239    }
240
241    /// Read the central directory from the ZIP file
242    fn read_central_directory(file: &mut BufReader<File>) -> Result<Vec<ZipEntry>> {
243        // Find end of central directory record
244        let eocd_offset = Self::find_eocd(file)?;
245
246        // Seek to EOCD
247        file.seek(SeekFrom::Start(eocd_offset))?;
248
249        // Read EOCD
250        let signature = Self::read_u32_le_static(file)?;
251        if signature != END_OF_CENTRAL_DIRECTORY_SIGNATURE {
252            return Err(SZipError::InvalidFormat(format!(
253                "Invalid end of central directory signature: 0x{:08x}",
254                signature
255            )));
256        }
257
258        // Skip disk number fields (4 bytes)
259        file.seek(SeekFrom::Current(4))?;
260
261        // Read number of entries on this disk (2 bytes)
262        let _entries_on_disk = Self::read_u16_le_static(file)?;
263
264        // Read total number of entries (2 bytes)
265
266        // These values may be placeholder 0xFFFF/0xFFFFFFFF when ZIP64 is used
267        let total_entries_16 = Self::read_u16_le_static(file)?;
268
269        // Read central directory size (4 bytes)
270        let cd_size_32 = Self::read_u32_le_static(file)?;
271
272        // Read central directory offset (4 bytes)
273        let cd_offset_32 = Self::read_u32_le_static(file)? as u64;
274
275        // Promote to u64 and handle ZIP64 if markers present
276        let mut total_entries = total_entries_16 as usize;
277        let mut cd_offset = cd_offset_32;
278        let _cd_size = cd_size_32 as u64;
279
280        if total_entries_16 == 0xFFFF || cd_size_32 == 0xFFFFFFFF || cd_offset_32 == 0xFFFFFFFF {
281            // Need to find ZIP64 EOCD locator and read ZIP64 EOCD record
282            let (zip64_total_entries, zip64_cd_size, zip64_cd_offset) =
283                Self::read_zip64_eocd(file, eocd_offset)?;
284            total_entries = zip64_total_entries as usize;
285            cd_offset = zip64_cd_offset;
286            // _cd_size can be used if needed (zip64_cd_size)
287            let _ = zip64_cd_size;
288        }
289
290        // Seek to central directory
291        file.seek(SeekFrom::Start(cd_offset))?;
292
293        // Read all central directory entries
294        let mut entries = Vec::with_capacity(total_entries);
295        for _ in 0..total_entries {
296            let signature = Self::read_u32_le_static(file)?;
297            if signature != CENTRAL_DIRECTORY_SIGNATURE {
298                break;
299            }
300
301            // Skip version made by, version needed, flags
302            file.seek(SeekFrom::Current(6))?;
303
304            let compression_method = Self::read_u16_le_static(file)?;
305
306            // Skip modification time, date, CRC-32
307            file.seek(SeekFrom::Current(8))?;
308
309            // Read sizes as 32-bit placeholders (may be 0xFFFFFFFF meaning ZIP64)
310            let compressed_size_32 = Self::read_u32_le_static(file)? as u64;
311            let uncompressed_size_32 = Self::read_u32_le_static(file)? as u64;
312            let filename_len = Self::read_u16_le_static(file)? as usize;
313            let extra_len = Self::read_u16_le_static(file)? as usize;
314            let comment_len = Self::read_u16_le_static(file)? as usize;
315
316            // Skip disk number, internal attributes, external attributes
317            file.seek(SeekFrom::Current(8))?;
318
319            let mut offset = Self::read_u32_le_static(file)? as u64;
320
321            // Read filename
322            let mut filename_buf = vec![0u8; filename_len];
323            file.read_exact(&mut filename_buf)?;
324            let name = String::from_utf8_lossy(&filename_buf).to_string();
325
326            // Read extra field so we can parse ZIP64 extra if present
327            let mut extra_buf = vec![0u8; extra_len];
328            if extra_len > 0 {
329                file.read_exact(&mut extra_buf)?;
330            }
331
332            // If sizes/offsets are 0xFFFFFFFF, parse ZIP64 extra field (0x0001)
333            let mut compressed_size = compressed_size_32;
334            let mut uncompressed_size = uncompressed_size_32;
335
336            if compressed_size_32 == 0xFFFFFFFF
337                || uncompressed_size_32 == 0xFFFFFFFF
338                || offset == 0xFFFFFFFF
339            {
340                // parse extra fields
341                let mut i = 0usize;
342                while i + 4 <= extra_buf.len() {
343                    let id = u16::from_le_bytes([extra_buf[i], extra_buf[i + 1]]);
344                    let data_len =
345                        u16::from_le_bytes([extra_buf[i + 2], extra_buf[i + 3]]) as usize;
346                    i += 4;
347                    if i + data_len > extra_buf.len() {
348                        break;
349                    }
350                    if id == 0x0001 {
351                        // ZIP64 extra field: contains values in order: original size, compressed size, relative header offset, disk start
352                        let mut cursor = 0usize;
353                        // read uncompressed size if placeholder present
354                        if uncompressed_size_32 == 0xFFFFFFFF && cursor + 8 <= data_len {
355                            uncompressed_size = u64::from_le_bytes([
356                                extra_buf[i + cursor],
357                                extra_buf[i + cursor + 1],
358                                extra_buf[i + cursor + 2],
359                                extra_buf[i + cursor + 3],
360                                extra_buf[i + cursor + 4],
361                                extra_buf[i + cursor + 5],
362                                extra_buf[i + cursor + 6],
363                                extra_buf[i + cursor + 7],
364                            ]);
365                            cursor += 8;
366                        }
367                        // read compressed size if placeholder present
368                        if compressed_size_32 == 0xFFFFFFFF && cursor + 8 <= data_len {
369                            compressed_size = u64::from_le_bytes([
370                                extra_buf[i + cursor],
371                                extra_buf[i + cursor + 1],
372                                extra_buf[i + cursor + 2],
373                                extra_buf[i + cursor + 3],
374                                extra_buf[i + cursor + 4],
375                                extra_buf[i + cursor + 5],
376                                extra_buf[i + cursor + 6],
377                                extra_buf[i + cursor + 7],
378                            ]);
379                            cursor += 8;
380                        }
381                        // read offset if placeholder present
382                        if offset == 0xFFFFFFFF && cursor + 8 <= data_len {
383                            offset = u64::from_le_bytes([
384                                extra_buf[i + cursor],
385                                extra_buf[i + cursor + 1],
386                                extra_buf[i + cursor + 2],
387                                extra_buf[i + cursor + 3],
388                                extra_buf[i + cursor + 4],
389                                extra_buf[i + cursor + 5],
390                                extra_buf[i + cursor + 6],
391                                extra_buf[i + cursor + 7],
392                            ]);
393                        }
394                        // we don't need disk start here
395                        break;
396                    }
397                    i += data_len;
398                }
399            }
400
401            // Skip comment
402            if comment_len > 0 {
403                file.seek(SeekFrom::Current(comment_len as i64))?;
404            }
405
406            entries.push(ZipEntry {
407                name,
408                compressed_size,
409                uncompressed_size,
410                compression_method,
411                offset,
412            });
413        }
414
415        Ok(entries)
416    }
417
418    /// When EOCD indicates ZIP64 usage, find and read ZIP64 EOCD locator and record
419    fn read_zip64_eocd(file: &mut BufReader<File>, eocd_offset: u64) -> Result<(u64, u64, u64)> {
420        // Search backwards from EOCD for ZIP64 EOCD locator signature (50 4b 06 07)
421        let search_start = eocd_offset.saturating_sub(65557);
422        file.seek(SeekFrom::Start(search_start))?;
423        let mut buffer = Vec::new();
424        file.read_to_end(&mut buffer)?;
425
426        let mut locator_pos: Option<usize> = None;
427        for i in (0..buffer.len().saturating_sub(3)).rev() {
428            if buffer[i] == 0x50
429                && buffer[i + 1] == 0x4b
430                && buffer[i + 2] == 0x06
431                && buffer[i + 3] == 0x07
432            {
433                locator_pos = Some(i);
434                break;
435            }
436        }
437
438        let locator_pos = locator_pos
439            .ok_or_else(|| SZipError::InvalidFormat("ZIP64 EOCD locator not found".to_string()))?;
440
441        // Read locator fields from buffer
442        // locator layout: signature(4), number of the disk with the start of the zip64 eocd(4), relative offset of the zip64 eocd(8), total number of disks(4)
443        let rel_off_bytes = &buffer[locator_pos + 8..locator_pos + 16];
444        let zip64_eocd_offset = u64::from_le_bytes([
445            rel_off_bytes[0],
446            rel_off_bytes[1],
447            rel_off_bytes[2],
448            rel_off_bytes[3],
449            rel_off_bytes[4],
450            rel_off_bytes[5],
451            rel_off_bytes[6],
452            rel_off_bytes[7],
453        ]);
454
455        // Seek to ZIP64 EOCD record
456        file.seek(SeekFrom::Start(zip64_eocd_offset))?;
457
458        let sig = Self::read_u32_le_static(file)?;
459        if sig != ZIP64_END_OF_CENTRAL_DIRECTORY_SIGNATURE {
460            return Err(SZipError::InvalidFormat(format!(
461                "Invalid ZIP64 EOCD signature: 0x{:08x}",
462                sig
463            )));
464        }
465
466        // size of ZIP64 EOCD record (8 bytes)
467        let _size = {
468            let mut buf = [0u8; 8];
469            file.read_exact(&mut buf)?;
470            u64::from_le_bytes(buf)
471        };
472
473        // skip version made by (2), version needed (2), disk number (4), disk where central dir starts (4)
474        file.seek(SeekFrom::Current(12))?;
475
476        // total number of entries on this disk (8)
477        let total_entries = {
478            let mut buf = [0u8; 8];
479            file.read_exact(&mut buf)?;
480            u64::from_le_bytes(buf)
481        };
482
483        // total number of entries (8) - some implementations write both; ignore the second value
484        {
485            let mut buf = [0u8; 8];
486            file.read_exact(&mut buf)?;
487            // ignore u64::from_le_bytes(buf)
488        }
489
490        // central directory size (8)
491        let cd_size = {
492            let mut buf = [0u8; 8];
493            file.read_exact(&mut buf)?;
494            u64::from_le_bytes(buf)
495        };
496
497        // central directory offset (8)
498        let cd_offset = {
499            let mut buf = [0u8; 8];
500            file.read_exact(&mut buf)?;
501            u64::from_le_bytes(buf)
502        };
503
504        Ok((total_entries, cd_size, cd_offset))
505    }
506
507    /// Find the end of central directory record by scanning from the end of the file
508    fn find_eocd(file: &mut BufReader<File>) -> Result<u64> {
509        let file_size = file.seek(SeekFrom::End(0))?;
510
511        // EOCD is at least 22 bytes, search last 65KB (max comment size + EOCD)
512        let search_start = file_size.saturating_sub(65557);
513        file.seek(SeekFrom::Start(search_start))?;
514
515        let mut buffer = Vec::new();
516        file.read_to_end(&mut buffer)?;
517
518        // Search for EOCD signature from the end
519        for i in (0..buffer.len().saturating_sub(3)).rev() {
520            if buffer[i] == 0x50
521                && buffer[i + 1] == 0x4b
522                && buffer[i + 2] == 0x05
523                && buffer[i + 3] == 0x06
524            {
525                return Ok(search_start + i as u64);
526            }
527        }
528
529        Err(SZipError::InvalidFormat(
530            "End of central directory not found".to_string(),
531        ))
532    }
533
534    fn read_u16_le(&mut self) -> Result<u16> {
535        let mut buf = [0u8; 2];
536        self.file.read_exact(&mut buf)?;
537        Ok(u16::from_le_bytes(buf))
538    }
539
540    fn read_u32_le(&mut self) -> Result<u32> {
541        let mut buf = [0u8; 4];
542        self.file.read_exact(&mut buf)?;
543        Ok(u32::from_le_bytes(buf))
544    }
545
546    fn read_u16_le_static(file: &mut BufReader<File>) -> Result<u16> {
547        let mut buf = [0u8; 2];
548        file.read_exact(&mut buf)?;
549        Ok(u16::from_le_bytes(buf))
550    }
551
552    fn read_u32_le_static(file: &mut BufReader<File>) -> Result<u32> {
553        let mut buf = [0u8; 4];
554        file.read_exact(&mut buf)?;
555        Ok(u32::from_le_bytes(buf))
556    }
557}