s_zip/
writer.rs

1//! Streaming ZIP writer that compresses data on-the-fly without temp files
2//!
3//! This eliminates:
4//! - Temp file disk I/O
5//! - File read buffers
6//! - Intermediate storage
7//!
8//! Expected RAM savings: 5-8 MB per file
9
10use crate::error::{Result, SZipError};
11use crc32fast::Hasher as Crc32;
12use flate2::write::DeflateEncoder;
13use flate2::Compression;
14use std::fs::File;
15use std::io::{Seek, Write};
16use std::path::Path;
17
/// Bookkeeping for one completed entry, kept until the central directory is written.
///
/// Fields are grouped in the order the central directory record consumes them.
struct ZipEntry {
    /// Entry name exactly as written into the archive headers.
    name: String,
    /// CRC-32 checksum recorded for the entry.
    crc32: u32,
    /// Number of bytes the entry occupies after compression.
    compressed_size: u64,
    /// Number of bytes supplied for the entry before compression.
    uncompressed_size: u64,
    /// Position in the output file where this entry's local header begins.
    local_header_offset: u64,
}
26
27/// Streaming ZIP writer that compresses data on-the-fly
28pub struct StreamingZipWriter {
29    output: File,
30    entries: Vec<ZipEntry>,
31    current_entry: Option<CurrentEntry>,
32    compression_level: u32,
33}
34
35struct CurrentEntry {
36    name: String,
37    local_header_offset: u64,
38    encoder: DeflateEncoder<CrcCountingWriter>,
39}
40
41/// Writer that counts bytes and computes CRC32 while writing to output
42struct CrcCountingWriter {
43    output: File,
44    crc: Crc32,
45    uncompressed_count: u64,
46    compressed_count: u64,
47}
48
49impl CrcCountingWriter {
50    fn new(output: File) -> Self {
51        Self {
52            output,
53            crc: Crc32::new(),
54            uncompressed_count: 0,
55            compressed_count: 0,
56        }
57    }
58}
59
60impl Write for CrcCountingWriter {
61    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
62        // This is the compressed data being written
63        let n = self.output.write(buf)?;
64        self.compressed_count += n as u64;
65        Ok(n)
66    }
67
68    fn flush(&mut self) -> std::io::Result<()> {
69        self.output.flush()
70    }
71}
72
73impl StreamingZipWriter {
74    /// Create a new ZIP writer with default compression level (6)
75    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self> {
76        Self::with_compression(path, 6)
77    }
78
79    /// Create a new ZIP writer with custom compression level (0-9)
80    pub fn with_compression<P: AsRef<Path>>(path: P, compression_level: u32) -> Result<Self> {
81        let output = File::create(path)?;
82        Ok(Self {
83            output,
84            entries: Vec::new(),
85            current_entry: None,
86            compression_level: compression_level.min(9),
87        })
88    }
89
90    /// Start a new entry (file) in the ZIP
91    pub fn start_entry(&mut self, name: &str) -> Result<()> {
92        // Finish previous entry if any
93        self.finish_current_entry()?;
94
95        let local_header_offset = self.output.stream_position()?;
96
97        // Write local file header with data descriptor flag (bit 3)
98        self.output.write_all(&[0x50, 0x4b, 0x03, 0x04])?; // signature
99        self.output.write_all(&[20, 0])?; // version needed
100        self.output.write_all(&[8, 0])?; // general purpose bit flag (bit 3 set)
101        self.output.write_all(&[8, 0])?; // compression method = deflate
102        self.output.write_all(&[0, 0, 0, 0])?; // mod time/date
103        self.output.write_all(&0u32.to_le_bytes())?; // crc32 placeholder
104        self.output.write_all(&0u32.to_le_bytes())?; // compressed size placeholder
105        self.output.write_all(&0u32.to_le_bytes())?; // uncompressed size placeholder
106        self.output.write_all(&(name.len() as u16).to_le_bytes())?;
107        self.output.write_all(&0u16.to_le_bytes())?; // extra len
108        self.output.write_all(name.as_bytes())?;
109
110        // Create encoder for this entry
111        let counting_writer = CrcCountingWriter::new(self.output.try_clone()?);
112        let encoder =
113            DeflateEncoder::new(counting_writer, Compression::new(self.compression_level));
114
115        self.current_entry = Some(CurrentEntry {
116            name: name.to_string(),
117            local_header_offset,
118            encoder,
119        });
120
121        Ok(())
122    }
123
124    /// Write uncompressed data to current entry (will be compressed on-the-fly)
125    pub fn write_data(&mut self, data: &[u8]) -> Result<()> {
126        if let Some(ref mut entry) = self.current_entry {
127            // Update CRC with uncompressed data
128            entry.encoder.get_mut().crc.update(data);
129            entry.encoder.get_mut().uncompressed_count += data.len() as u64;
130
131            // Write to encoder (compresses and writes to output)
132            entry.encoder.write_all(data)?;
133            Ok(())
134        } else {
135            Err(SZipError::InvalidFormat("No entry started".to_string()))
136        }
137    }
138
139    /// Finish current entry and write data descriptor
140    fn finish_current_entry(&mut self) -> Result<()> {
141        if let Some(entry) = self.current_entry.take() {
142            // Finish compression
143            let counting_writer = entry.encoder.finish()?;
144
145            let crc = counting_writer.crc.finalize();
146            let compressed_size = counting_writer.compressed_count;
147            let uncompressed_size = counting_writer.uncompressed_count;
148
149            // Write data descriptor
150            // signature
151            self.output.write_all(&[0x50, 0x4b, 0x07, 0x08])?;
152            self.output.write_all(&crc.to_le_bytes())?;
153            // If sizes exceed 32-bit, write 64-bit sizes (ZIP64 data descriptor)
154            if compressed_size > u32::MAX as u64 || uncompressed_size > u32::MAX as u64 {
155                self.output.write_all(&compressed_size.to_le_bytes())?;
156                self.output.write_all(&uncompressed_size.to_le_bytes())?;
157            } else {
158                self.output
159                    .write_all(&(compressed_size as u32).to_le_bytes())?;
160                self.output
161                    .write_all(&(uncompressed_size as u32).to_le_bytes())?;
162            }
163
164            // Save entry info for central directory
165            self.entries.push(ZipEntry {
166                name: entry.name,
167                local_header_offset: entry.local_header_offset,
168                crc32: crc,
169                compressed_size,
170                uncompressed_size,
171            });
172        }
173        Ok(())
174    }
175
176    /// Finish ZIP file (write central directory and close)
177    pub fn finish(mut self) -> Result<()> {
178        // Finish last entry
179        self.finish_current_entry()?;
180
181        let central_dir_offset = self.output.stream_position()?;
182
183        // Write central directory
184        for entry in &self.entries {
185            self.output.write_all(&[0x50, 0x4b, 0x01, 0x02])?; // central dir sig
186            self.output.write_all(&[20, 0])?; // version made by
187            self.output.write_all(&[20, 0])?; // version needed
188            self.output.write_all(&[8, 0])?; // general purpose bit flag (bit 3 set)
189            self.output.write_all(&[8, 0])?; // compression method
190            self.output.write_all(&[0, 0, 0, 0])?; // mod time/date
191            self.output.write_all(&entry.crc32.to_le_bytes())?;
192
193            // Write sizes (32-bit placeholders or actual values)
194            if entry.compressed_size > u32::MAX as u64 {
195                self.output.write_all(&0xFFFFFFFFu32.to_le_bytes())?;
196            } else {
197                self.output
198                    .write_all(&(entry.compressed_size as u32).to_le_bytes())?;
199            }
200
201            if entry.uncompressed_size > u32::MAX as u64 {
202                self.output.write_all(&0xFFFFFFFFu32.to_le_bytes())?;
203            } else {
204                self.output
205                    .write_all(&(entry.uncompressed_size as u32).to_le_bytes())?;
206            }
207
208            self.output
209                .write_all(&(entry.name.len() as u16).to_le_bytes())?;
210
211            // Prepare ZIP64 extra field if needed
212            let mut extra_field: Vec<u8> = Vec::new();
213            if entry.uncompressed_size > u32::MAX as u64
214                || entry.compressed_size > u32::MAX as u64
215                || entry.local_header_offset > u32::MAX as u64
216            {
217                // ZIP64 extra header ID 0x0001
218                extra_field.extend_from_slice(&0x0001u16.to_le_bytes());
219                // data size: we'll include uncompressed (8) if needed, compressed (8) if needed, and offset (8) if needed
220                let mut data: Vec<u8> = Vec::new();
221                if entry.uncompressed_size > u32::MAX as u64 {
222                    data.extend_from_slice(&entry.uncompressed_size.to_le_bytes());
223                }
224                if entry.compressed_size > u32::MAX as u64 {
225                    data.extend_from_slice(&entry.compressed_size.to_le_bytes());
226                }
227                if entry.local_header_offset > u32::MAX as u64 {
228                    data.extend_from_slice(&entry.local_header_offset.to_le_bytes());
229                }
230                extra_field.extend_from_slice(&(data.len() as u16).to_le_bytes());
231                extra_field.extend_from_slice(&data);
232            }
233
234            self.output
235                .write_all(&(extra_field.len() as u16).to_le_bytes())?; // extra len
236            self.output.write_all(&0u16.to_le_bytes())?; // file comment len
237            self.output.write_all(&0u16.to_le_bytes())?; // disk number start
238            self.output.write_all(&0u16.to_le_bytes())?; // internal attrs
239            self.output.write_all(&0u32.to_le_bytes())?; // external attrs
240
241            // local header offset (32-bit or 0xFFFFFFFF)
242            if entry.local_header_offset > u32::MAX as u64 {
243                self.output.write_all(&0xFFFFFFFFu32.to_le_bytes())?;
244            } else {
245                self.output
246                    .write_all(&(entry.local_header_offset as u32).to_le_bytes())?;
247            }
248
249            self.output.write_all(entry.name.as_bytes())?;
250            if !extra_field.is_empty() {
251                self.output.write_all(&extra_field)?;
252            }
253        }
254
255        let central_dir_size = self.output.stream_position()? - central_dir_offset;
256
257        // Determine if we need ZIP64 EOCD
258        let need_zip64 = self.entries.len() > u16::MAX as usize
259            || central_dir_size > u32::MAX as u64
260            || central_dir_offset > u32::MAX as u64;
261
262        if need_zip64 {
263            // Write ZIP64 End of Central Directory Record
264            // signature
265            self.output.write_all(&[0x50, 0x4b, 0x06, 0x06])?; // 0x06064b50
266                                                               // size of zip64 eocd record (size of remaining fields)
267                                                               // We'll write fixed-size fields: version made by(2)+version needed(2)+disk numbers(4+4)+entries on disk(8)+total entries(8)+cd size(8)+cd offset(8)
268            let zip64_eocd_size: u64 = 44;
269            self.output.write_all(&zip64_eocd_size.to_le_bytes())?;
270            // version made by, version needed
271            self.output.write_all(&[20, 0])?;
272            self.output.write_all(&[20, 0])?;
273            // disk number, disk where central dir starts
274            self.output.write_all(&0u32.to_le_bytes())?;
275            self.output.write_all(&0u32.to_le_bytes())?;
276            // entries on this disk (8)
277            self.output
278                .write_all(&(self.entries.len() as u64).to_le_bytes())?;
279            // total entries (8)
280            self.output
281                .write_all(&(self.entries.len() as u64).to_le_bytes())?;
282            // central directory size (8)
283            self.output.write_all(&central_dir_size.to_le_bytes())?;
284            // central directory offset (8)
285            self.output.write_all(&central_dir_offset.to_le_bytes())?;
286
287            // Write ZIP64 EOCD locator
288            // signature
289            self.output.write_all(&[0x50, 0x4b, 0x06, 0x07])?; // 0x07064b50
290                                                               // disk with ZIP64 EOCD (4)
291            self.output.write_all(&0u32.to_le_bytes())?;
292            // relative offset of ZIP64 EOCD (8)
293            let zip64_eocd_pos = central_dir_offset + central_dir_size; // directly after central dir
294            self.output.write_all(&zip64_eocd_pos.to_le_bytes())?;
295            // total number of disks
296            self.output.write_all(&0u32.to_le_bytes())?;
297        }
298
299        // Write end of central directory (classic)
300        self.output.write_all(&[0x50, 0x4b, 0x05, 0x06])?;
301        self.output.write_all(&0u16.to_le_bytes())?; // disk number
302        self.output.write_all(&0u16.to_le_bytes())?; // disk with central dir
303
304        // number of entries (16-bit or 0xFFFF if ZIP64 used)
305        if self.entries.len() > u16::MAX as usize {
306            self.output.write_all(&0xFFFFu16.to_le_bytes())?;
307            self.output.write_all(&0xFFFFu16.to_le_bytes())?;
308        } else {
309            self.output
310                .write_all(&(self.entries.len() as u16).to_le_bytes())?;
311            self.output
312                .write_all(&(self.entries.len() as u16).to_le_bytes())?;
313        }
314
315        // central dir size and offset (32-bit or 0xFFFFFFFF)
316        if central_dir_size > u32::MAX as u64 {
317            self.output.write_all(&0xFFFFFFFFu32.to_le_bytes())?;
318        } else {
319            self.output
320                .write_all(&(central_dir_size as u32).to_le_bytes())?;
321        }
322
323        if central_dir_offset > u32::MAX as u64 {
324            self.output.write_all(&0xFFFFFFFFu32.to_le_bytes())?;
325        } else {
326            self.output
327                .write_all(&(central_dir_offset as u32).to_le_bytes())?;
328        }
329
330        self.output.write_all(&0u16.to_le_bytes())?; // comment len
331
332        self.output.flush()?;
333        Ok(())
334    }
335}