1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
use crate::{encoding::Encoding, error::*, format::*, reader::buffer::*, transition};

use nom::Offset;
use std::io::Read;
use tracing::trace;

/// ArchiveReader parses a valid zip archive into an [Archive][]. In particular, this struct finds
/// an end of central directory record, parses the entire central directory, detects text encoding,
/// and normalizes metadata.
pub struct ArchiveReader {
    // Size of the entire zip file
    size: u64,
    state: ArchiveReaderState,
}

pub enum ArchiveReaderResult {
    /// Indicates that [ArchiveReader][] has work left, and the loop should continue.
    Continue,
    /// Indicates that [ArchiveReader][] is done reading the central directory,
    /// contains an [Archive][]. Calling any method after [process()](ArchiveReader::process()) has returned
    /// `Done` will panic.
    Done(Archive),
}

enum ArchiveReaderState {
    /// Used while transitioning because ownership rules are tough.
    Transitioning,

    /// Finding and reading the end of central directory record
    ReadEocd { buffer: Buffer, haystack_size: u64 },

    /// Reading the zip64 end of central directory record.
    ReadEocd64Locator {
        buffer: Buffer,
        eocdr: Located<EndOfCentralDirectoryRecord>,
    },

    /// Reading the zip64 end of central directory record.
    ReadEocd64 {
        buffer: Buffer,
        eocdr64_offset: u64,
        eocdr: Located<EndOfCentralDirectoryRecord>,
    },

    /// Reading all headers from the central directory
    ReadCentralDirectory {
        buffer: Buffer,
        eocd: EndOfCentralDirectory,
        directory_headers: Vec<DirectoryHeader>,
    },

    /// Done!
    Done,
}

impl ArchiveReaderState {
    fn buffer_as_mut(&mut self) -> Option<&mut Buffer> {
        use ArchiveReaderState as S;
        match self {
            S::ReadEocd { ref mut buffer, .. } => Some(buffer),
            S::ReadEocd64Locator { ref mut buffer, .. } => Some(buffer),
            S::ReadEocd64 { ref mut buffer, .. } => Some(buffer),
            S::ReadCentralDirectory { ref mut buffer, .. } => Some(buffer),
            _ => None,
        }
    }
}

impl ArchiveReader {
    /// This should be > 65KiB, because the section at the end of the
    /// file that we check for end of central directory record is 65KiB.
    /// 128 is the next power of two.
    const DEFAULT_BUFFER_SIZE: usize = 128 * 1024;

    /// Create a new archive reader with a specified file size.
    ///
    /// Actual reading of the file is performed by calling
    /// [wants_read()](ArchiveReader::wants_read()), [read()](ArchiveReader::read()) and
    /// [process()](ArchiveReader::process()) in a loop.
    pub fn new(size: u64) -> Self {
        let haystack_size: u64 = 65 * 1024;
        let haystack_size = if size < haystack_size {
            size
        } else {
            haystack_size
        };

        Self {
            size,
            state: ArchiveReaderState::ReadEocd {
                buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE),
                haystack_size,
            },
        }
    }

    /// Returns whether or not this reader needs more data to continue.
    ///
    /// Returns `Some(offset)` if this reader needs to read some data from `offset`.
    /// In this case, [read()](ArchiveReader::read()) should be called with a [Read]
    /// at the correct offset.
    ///
    /// Returns `None` if the reader does not need data and [process()](ArchiveReader::process())
    /// can be called directly.
    pub fn wants_read(&self) -> Option<u64> {
        use ArchiveReaderState as S;
        match self.state {
            S::ReadEocd {
                ref buffer,
                haystack_size,
            } => Some(buffer.read_offset(self.size - haystack_size)),
            S::ReadEocd64Locator {
                ref buffer,
                ref eocdr,
            } => {
                let length = EndOfCentralDirectory64Locator::LENGTH as u64;
                Some(buffer.read_offset(eocdr.offset - length))
            }
            S::ReadEocd64 {
                ref buffer,
                eocdr64_offset,
                ..
            } => Some(buffer.read_offset(eocdr64_offset)),
            S::ReadCentralDirectory {
                ref buffer,
                ref eocd,
                ..
            } => Some(buffer.read_offset(eocd.directory_offset())),
            S::Done { .. } => panic!("Called wants_read() on ArchiveReader in Done state"),
            S::Transitioning => unreachable!(),
        }
    }

    /// Reads some data from `rd` into the reader's internal buffer.
    ///
    /// Any I/O errors will be returned.
    ///
    /// If successful, this returns the number of bytes read. On success,
    /// [process()](ArchiveReader::process()) should be called next.
    pub fn read(&mut self, rd: &mut dyn Read) -> Result<usize, std::io::Error> {
        if let Some(buffer) = self.state.buffer_as_mut() {
            buffer.read(rd)
        } else {
            Ok(0)
        }
    }

    /// Process buffered data
    ///
    /// Errors returned from process() are caused by invalid zip archives,
    /// unsupported format quirks, or implementation bugs - never I/O errors.
    ///
    /// A result of [ArchiveReaderResult::Continue] indicates one should loop again,
    /// starting with [wants_read()](ArchiveReader::wants_read()).
    ///
    /// A result of [ArchiveReaderResult::Done] contains the [Archive], and indicates that no
    /// method should ever be called again on this reader.
    pub fn process(&mut self) -> Result<ArchiveReaderResult, Error> {
        use ArchiveReaderResult as R;
        use ArchiveReaderState as S;
        match self.state {
            S::ReadEocd {
                ref mut buffer,
                haystack_size,
            } => {
                if buffer.read_bytes() < haystack_size {
                    return Ok(R::Continue);
                }

                match {
                    let haystack = &buffer.data()[..haystack_size as usize];
                    EndOfCentralDirectoryRecord::find_in_block(haystack)
                } {
                    None => Err(FormatError::DirectoryEndSignatureNotFound.into()),
                    Some(mut eocdr) => {
                        buffer.reset();
                        eocdr.offset += self.size - haystack_size;

                        if eocdr.offset < EndOfCentralDirectory64Locator::LENGTH as u64 {
                            // no room for an EOCD64 locator, definitely not a zip64 file
                            transition!(self.state => (S::ReadEocd { mut buffer, .. }) {
                                buffer.reset();
                                S::ReadCentralDirectory {
                                    buffer,
                                    eocd: EndOfCentralDirectory::new(self.size, eocdr, None)?,
                                    directory_headers: vec![],
                                }
                            });
                            Ok(R::Continue)
                        } else {
                            transition!(self.state => (S::ReadEocd { mut buffer, .. }) {
                                buffer.reset();
                                S::ReadEocd64Locator { buffer, eocdr }
                            });
                            Ok(R::Continue)
                        }
                    }
                }
            }
            S::ReadEocd64Locator { ref mut buffer, .. } => {
                match EndOfCentralDirectory64Locator::parse(buffer.data()) {
                    Err(nom::Err::Incomplete(_)) => {
                        // need more data
                        Ok(R::Continue)
                    }
                    Err(nom::Err::Error(_)) | Err(nom::Err::Failure(_)) => {
                        // we don't have a zip64 end of central directory locator - that's ok!
                        transition!(self.state => (S::ReadEocd64Locator { mut buffer, eocdr }) {
                            buffer.reset();
                            S::ReadCentralDirectory {
                                buffer,
                                eocd: EndOfCentralDirectory::new(self.size, eocdr, None)?,
                                directory_headers: vec![],
                            }
                        });
                        Ok(R::Continue)
                    }
                    Ok((_, locator)) => {
                        transition!(self.state => (S::ReadEocd64Locator { mut buffer, eocdr }) {
                            buffer.reset();
                            S::ReadEocd64 {
                                buffer,
                                eocdr64_offset: locator.directory_offset,
                                eocdr,
                            }
                        });
                        Ok(R::Continue)
                    }
                }
            }
            S::ReadEocd64 { ref mut buffer, .. } => {
                match EndOfCentralDirectory64Record::parse(buffer.data()) {
                    Err(nom::Err::Incomplete(_)) => {
                        // need more data
                        Ok(R::Continue)
                    }
                    Err(nom::Err::Error(_)) | Err(nom::Err::Failure(_)) => {
                        // at this point, we really expected to have a zip64 end
                        // of central directory record, so, we want to propagate
                        // that error.
                        Err(FormatError::Directory64EndRecordInvalid.into())
                    }
                    Ok((_, eocdr64)) => {
                        transition!(self.state => (S::ReadEocd64 { mut buffer, eocdr, eocdr64_offset }) {
                            buffer.reset();
                            S::ReadCentralDirectory {
                                buffer,
                                eocd: EndOfCentralDirectory::new(self.size, eocdr, Some(Located {
                                    offset: eocdr64_offset,
                                    inner: eocdr64
                                }))?,
                                directory_headers: vec![],
                            }
                        });
                        Ok(R::Continue)
                    }
                }
            }
            S::ReadCentralDirectory {
                ref mut buffer,
                ref eocd,
                ref mut directory_headers,
            } => {
                trace!(
                    "ReadCentralDirectory | process(), available: {}",
                    buffer.available_data()
                );
                'read_headers: while buffer.available_data() > 0 {
                    match DirectoryHeader::parse(buffer.data()) {
                        Err(nom::Err::Incomplete(_needed)) => {
                            // need more data
                            break 'read_headers;
                        }
                        Err(nom::Err::Error(_err)) | Err(nom::Err::Failure(_err)) => {
                            // this is the normal end condition when reading
                            // the central directory (due to 65536-entries non-zip64 files)
                            // let's just check a few numbers first.

                            // only compare 16 bits here
                            let expected_records = directory_headers.len() as u16;
                            let actual_records = eocd.directory_records() as u16;

                            if expected_records == actual_records {
                                let mut detectorng = chardetng::EncodingDetector::new();
                                let mut all_utf8 = true;
                                let mut had_suspicious_chars_for_cp437 = false;

                                {
                                    let max_feed: usize = 4096;
                                    let mut total_fed: usize = 0;
                                    let mut feed = |slice: &[u8]| {
                                        detectorng.feed(slice, false);
                                        for b in slice {
                                            if (0xB0..=0xDF).contains(b) {
                                                // those are, like, box drawing characters
                                                had_suspicious_chars_for_cp437 = true;
                                            }
                                        }

                                        total_fed += slice.len();
                                        total_fed < max_feed
                                    };

                                    'recognize_encoding: for fh in
                                        directory_headers.iter().filter(|fh| fh.is_non_utf8())
                                    {
                                        all_utf8 = false;
                                        if !feed(&fh.name.0) || !feed(&fh.comment.0) {
                                            break 'recognize_encoding;
                                        }
                                    }
                                }

                                let encoding = {
                                    if all_utf8 {
                                        Encoding::Utf8
                                    } else {
                                        let encoding = detectorng.guess(None, true);
                                        if encoding == encoding_rs::SHIFT_JIS {
                                            // well hold on, sometimes Codepage 437 is detected as
                                            // Shift-JIS by chardetng. If we have any characters
                                            // that aren't valid DOS file names, then okay it's probably
                                            // Shift-JIS. Otherwise, assume it's CP437.
                                            if had_suspicious_chars_for_cp437 {
                                                Encoding::ShiftJis
                                            } else {
                                                Encoding::Cp437
                                            }
                                        } else if encoding == encoding_rs::UTF_8 {
                                            Encoding::Utf8
                                        } else {
                                            Encoding::Cp437
                                        }
                                    }
                                };

                                let is_zip64 = eocd.dir64.is_some();
                                let global_offset = eocd.global_offset as u64;
                                let entries: Result<Vec<StoredEntry>, Error> = directory_headers
                                    .iter()
                                    .map(|x| x.as_stored_entry(is_zip64, encoding, global_offset))
                                    .collect();
                                let entries = entries?;

                                let mut comment: Option<String> = None;
                                if !eocd.comment().0.is_empty() {
                                    comment = Some(encoding.decode(&eocd.comment().0)?);
                                }

                                self.state = S::Done;
                                return Ok(R::Done(Archive {
                                    size: self.size,
                                    comment,
                                    entries,
                                    encoding,
                                }));
                            } else {
                                // if we read the wrong number of directory entries,
                                // error out.
                                return Err(FormatError::InvalidCentralRecord {
                                    expected: expected_records,
                                    actual: actual_records,
                                }
                                .into());
                            }
                        }
                        Ok((remaining, dh)) => {
                            let consumed = buffer.data().offset(remaining);
                            buffer.consume(consumed);
                            directory_headers.push(dh);
                        }
                    }
                }

                // need more data
                Ok(R::Continue)
            }
            S::Done { .. } => panic!("Called process() on ArchiveReader in Done state"),
            S::Transitioning => unreachable!(),
        }
    }
}