s_zip/
reader.rs

//! Streaming ZIP reader - extracts individual entries without reading the whole archive
//!
//! This is a minimal ZIP reader. Opening an archive parses the central directory
//! into an in-memory list of entries; the data of a specific entry is then read
//! (or streamed) on demand by seeking to that entry's local file header.
5
6use crate::error::{Result, SZipError};
7use flate2::read::DeflateDecoder;
8use std::fs::File;
9use std::io::{BufReader, Read, Seek, SeekFrom};
10use std::path::Path;
11
/// Magic number opening every local file header (`PK\x03\x04` on disk, read little-endian).
const LOCAL_FILE_HEADER_SIGNATURE: u32 = u32::from_le_bytes(*b"PK\x03\x04");

/// Magic number opening every central directory file header (`PK\x01\x02` on disk).
const CENTRAL_DIRECTORY_SIGNATURE: u32 = u32::from_le_bytes(*b"PK\x01\x02");

/// Magic number opening the end of central directory record (`PK\x05\x06` on disk).
const END_OF_CENTRAL_DIRECTORY_SIGNATURE: u32 = u32::from_le_bytes(*b"PK\x05\x06");
20
/// Metadata describing one file, as recorded in the ZIP central directory.
///
/// All values are copied from the central directory record; nothing here is
/// cross-checked against the entry's local file header.
#[derive(Debug, Clone)]
pub struct ZipEntry {
    /// Entry name inside the archive (bytes that are not valid UTF-8 are
    /// replaced lossily when the directory is parsed).
    pub name: String,
    /// Number of bytes the entry's data occupies inside the archive.
    pub compressed_size: u64,
    /// Size recorded for the entry's data once decompressed, in bytes.
    pub uncompressed_size: u64,
    /// ZIP compression method code; the reader handles `0` (stored) and
    /// `8` (DEFLATE) and rejects every other value.
    pub compression_method: u16,
    /// Byte offset, from the start of the file, of the entry's local file header.
    pub offset: u64,
}
30
31/// Streaming ZIP archive reader
32pub struct StreamingZipReader {
33    file: BufReader<File>,
34    entries: Vec<ZipEntry>,
35}
36
37impl StreamingZipReader {
38    /// Open a ZIP file and read its central directory
39    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
40        let mut file = BufReader::new(File::open(path)?);
41
42        // Find and read central directory
43        let entries = Self::read_central_directory(&mut file)?;
44
45        Ok(StreamingZipReader { file, entries })
46    }
47
48    /// Get list of all entries in the ZIP
49    pub fn entries(&self) -> &[ZipEntry] {
50        &self.entries
51    }
52
53    /// Find an entry by name
54    pub fn find_entry(&self, name: &str) -> Option<&ZipEntry> {
55        self.entries.iter().find(|e| e.name == name)
56    }
57
58    /// Read an entry's decompressed data into a vector
59    pub fn read_entry(&mut self, entry: &ZipEntry) -> Result<Vec<u8>> {
60        // Seek to local file header
61        self.file.seek(SeekFrom::Start(entry.offset))?;
62
63        // Read and verify local file header
64        let signature = self.read_u32_le()?;
65        if signature != LOCAL_FILE_HEADER_SIGNATURE {
66            return Err(SZipError::InvalidFormat(
67                "Invalid local file header signature".to_string(),
68            ));
69        }
70
71        // Skip version, flags, compression method
72        self.file.seek(SeekFrom::Current(6))?;
73
74        // Skip modification time and date, CRC-32
75        self.file.seek(SeekFrom::Current(8))?;
76
77        // Read compressed and uncompressed sizes (already known from central directory)
78        self.file.seek(SeekFrom::Current(8))?;
79
80        // Read filename length and extra field length
81        let filename_len = self.read_u16_le()? as i64;
82        let extra_len = self.read_u16_le()? as i64;
83
84        // Skip filename and extra field
85        self.file
86            .seek(SeekFrom::Current(filename_len + extra_len))?;
87
88        // Now read the compressed data
89        let mut compressed_data = vec![0u8; entry.compressed_size as usize];
90        self.file.read_exact(&mut compressed_data)?;
91
92        // Decompress if needed
93        let data = if entry.compression_method == 8 {
94            // DEFLATE compression
95            let mut decoder = DeflateDecoder::new(&compressed_data[..]);
96            let mut decompressed = Vec::new();
97            decoder.read_to_end(&mut decompressed)?;
98            decompressed
99        } else if entry.compression_method == 0 {
100            // No compression (stored)
101            compressed_data
102        } else {
103            return Err(SZipError::UnsupportedCompression(entry.compression_method));
104        };
105
106        Ok(data)
107    }
108
109    /// Read an entry by name
110    pub fn read_entry_by_name(&mut self, name: &str) -> Result<Vec<u8>> {
111        let entry = self
112            .find_entry(name)
113            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
114            .clone();
115
116        self.read_entry(&entry)
117    }
118
119    /// Get a streaming reader for an entry by name (for large files)
120    /// Returns a reader that decompresses data on-the-fly without loading everything into memory
121    pub fn read_entry_streaming_by_name(&mut self, name: &str) -> Result<Box<dyn Read + '_>> {
122        let entry = self
123            .find_entry(name)
124            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
125            .clone();
126
127        self.read_entry_streaming(&entry)
128    }
129
130    /// Get a streaming reader for an entry (for large files)
131    /// Returns a reader that decompresses data on-the-fly without loading everything into memory
132    pub fn read_entry_streaming(&mut self, entry: &ZipEntry) -> Result<Box<dyn Read + '_>> {
133        // Seek to local file header
134        self.file.seek(SeekFrom::Start(entry.offset))?;
135
136        // Read and verify local file header
137        let signature = self.read_u32_le()?;
138        if signature != LOCAL_FILE_HEADER_SIGNATURE {
139            return Err(SZipError::InvalidFormat(
140                "Invalid local file header signature".to_string(),
141            ));
142        }
143
144        // Skip version, flags, compression method
145        self.file.seek(SeekFrom::Current(6))?;
146
147        // Skip modification time and date, CRC-32
148        self.file.seek(SeekFrom::Current(8))?;
149
150        // Read compressed and uncompressed sizes
151        self.file.seek(SeekFrom::Current(8))?;
152
153        // Read filename length and extra field length
154        let filename_len = self.read_u16_le()? as i64;
155        let extra_len = self.read_u16_le()? as i64;
156
157        // Skip filename and extra field
158        self.file
159            .seek(SeekFrom::Current(filename_len + extra_len))?;
160
161        // Create a reader limited to compressed data size
162        let limited_reader = (&mut self.file).take(entry.compressed_size);
163
164        // Wrap with decompressor if needed
165        if entry.compression_method == 8 {
166            // DEFLATE compression
167            Ok(Box::new(DeflateDecoder::new(limited_reader)))
168        } else if entry.compression_method == 0 {
169            // No compression (stored)
170            Ok(Box::new(limited_reader))
171        } else {
172            Err(SZipError::UnsupportedCompression(entry.compression_method))
173        }
174    }
175
176    /// Get a streaming reader for an entry by name
177    pub fn read_entry_by_name_streaming(&mut self, name: &str) -> Result<Box<dyn Read + '_>> {
178        let entry = self
179            .find_entry(name)
180            .ok_or_else(|| SZipError::EntryNotFound(name.to_string()))?
181            .clone();
182
183        self.read_entry_streaming(&entry)
184    }
185
186    /// Read the central directory from the ZIP file
187    fn read_central_directory(file: &mut BufReader<File>) -> Result<Vec<ZipEntry>> {
188        // Find end of central directory record
189        let eocd_offset = Self::find_eocd(file)?;
190
191        // Seek to EOCD
192        file.seek(SeekFrom::Start(eocd_offset))?;
193
194        // Read EOCD
195        let signature = Self::read_u32_le_static(file)?;
196        if signature != END_OF_CENTRAL_DIRECTORY_SIGNATURE {
197            return Err(SZipError::InvalidFormat(format!(
198                "Invalid end of central directory signature: 0x{:08x}",
199                signature
200            )));
201        }
202
203        // Skip disk number fields (4 bytes)
204        file.seek(SeekFrom::Current(4))?;
205
206        // Read number of entries on this disk (2 bytes)
207        let _entries_on_disk = Self::read_u16_le_static(file)?;
208
209        // Read total number of entries (2 bytes)
210        let total_entries = Self::read_u16_le_static(file)? as usize;
211
212        // Read central directory size (4 bytes)
213        let _cd_size = Self::read_u32_le_static(file)?;
214
215        // Read central directory offset (4 bytes)
216        let cd_offset = Self::read_u32_le_static(file)? as u64;
217
218        // Seek to central directory
219        file.seek(SeekFrom::Start(cd_offset))?;
220
221        // Read all central directory entries
222        let mut entries = Vec::with_capacity(total_entries);
223        for _ in 0..total_entries {
224            let signature = Self::read_u32_le_static(file)?;
225            if signature != CENTRAL_DIRECTORY_SIGNATURE {
226                break;
227            }
228
229            // Skip version made by, version needed, flags
230            file.seek(SeekFrom::Current(6))?;
231
232            let compression_method = Self::read_u16_le_static(file)?;
233
234            // Skip modification time, date, CRC-32
235            file.seek(SeekFrom::Current(8))?;
236
237            let compressed_size = Self::read_u32_le_static(file)? as u64;
238            let uncompressed_size = Self::read_u32_le_static(file)? as u64;
239            let filename_len = Self::read_u16_le_static(file)? as usize;
240            let extra_len = Self::read_u16_le_static(file)? as usize;
241            let comment_len = Self::read_u16_le_static(file)? as usize;
242
243            // Skip disk number, internal attributes, external attributes
244            file.seek(SeekFrom::Current(8))?;
245
246            let offset = Self::read_u32_le_static(file)? as u64;
247
248            // Read filename
249            let mut filename_buf = vec![0u8; filename_len];
250            file.read_exact(&mut filename_buf)?;
251            let name = String::from_utf8_lossy(&filename_buf).to_string();
252
253            // Skip extra field and comment
254            file.seek(SeekFrom::Current((extra_len + comment_len) as i64))?;
255
256            entries.push(ZipEntry {
257                name,
258                compressed_size,
259                uncompressed_size,
260                compression_method,
261                offset,
262            });
263        }
264
265        Ok(entries)
266    }
267
268    /// Find the end of central directory record by scanning from the end of the file
269    fn find_eocd(file: &mut BufReader<File>) -> Result<u64> {
270        let file_size = file.seek(SeekFrom::End(0))?;
271
272        // EOCD is at least 22 bytes, search last 65KB (max comment size + EOCD)
273        let search_start = file_size.saturating_sub(65557);
274        file.seek(SeekFrom::Start(search_start))?;
275
276        let mut buffer = Vec::new();
277        file.read_to_end(&mut buffer)?;
278
279        // Search for EOCD signature from the end
280        for i in (0..buffer.len().saturating_sub(3)).rev() {
281            if buffer[i] == 0x50
282                && buffer[i + 1] == 0x4b
283                && buffer[i + 2] == 0x05
284                && buffer[i + 3] == 0x06
285            {
286                return Ok(search_start + i as u64);
287            }
288        }
289
290        Err(SZipError::InvalidFormat(
291            "End of central directory not found".to_string(),
292        ))
293    }
294
295    fn read_u16_le(&mut self) -> Result<u16> {
296        let mut buf = [0u8; 2];
297        self.file.read_exact(&mut buf)?;
298        Ok(u16::from_le_bytes(buf))
299    }
300
301    fn read_u32_le(&mut self) -> Result<u32> {
302        let mut buf = [0u8; 4];
303        self.file.read_exact(&mut buf)?;
304        Ok(u32::from_le_bytes(buf))
305    }
306
307    fn read_u16_le_static(file: &mut BufReader<File>) -> Result<u16> {
308        let mut buf = [0u8; 2];
309        file.read_exact(&mut buf)?;
310        Ok(u16::from_le_bytes(buf))
311    }
312
313    fn read_u32_le_static(file: &mut BufReader<File>) -> Result<u32> {
314        let mut buf = [0u8; 4];
315        file.read_exact(&mut buf)?;
316        Ok(u32::from_le_bytes(buf))
317    }
318}