Skip to main content

hwpforge_smithy_hwpx/decoder/
package.rs

//! ZIP package reader for HWPX files.
//!
//! [`PackageReader`] wraps a `ZipArchive` and provides safe access
//! to the files inside an HWPX document archive.
5
6use std::io::{Cursor, Read};
7
8use zip::ZipArchive;
9
10use crate::error::{HwpxError, HwpxResult};
11
// ── Safety limits ────────────────────────────────────────────────

/// Number of bytes in one mebibyte; the unit the size limits below are expressed in.
const BYTES_PER_MIB: u64 = 1024 * 1024;

/// Upper bound on the decompressed size of any single entry (50 MB).
const MAX_ENTRY_SIZE: u64 = 50 * BYTES_PER_MIB;

/// Upper bound on the decompressed bytes read across all entries combined (500 MB).
const MAX_TOTAL_SIZE: u64 = 500 * BYTES_PER_MIB;

/// Upper bound on how many entries an archive may contain.
const MAX_ENTRIES: usize = 10_000;
22
// ── HWPX constants ───────────────────────────────────────────────

/// Contents of the `mimetype` entry that are accepted as identifying an
/// HWPX package (the entry is expected to be first in the ZIP and stored
/// uncompressed).
const ACCEPTED_MIMETYPES: &[&str] = &[
    "application/hwp+zip",
    "application/haansofthwp+zip",
    "application/vnd.hancom.hwp+zip",
];

/// Archive path of the mimetype entry.
const MIMETYPE_PATH: &str = "mimetype";

/// Archive path of the header XML.
const HEADER_PATH: &str = "Contents/header.xml";

/// Leading part of every section XML path; the section index follows it.
const SECTION_PREFIX: &str = "Contents/section";

/// Trailing part of every section XML path.
const SECTION_SUFFIX: &str = ".xml";
40
41// ── PackageReader ────────────────────────────────────────────────
42
43/// Reader for HWPX ZIP archives.
44///
45/// Validates structure and provides access to individual XML files
46/// within the archive. Enforces safety limits on decompressed data
47/// to prevent ZIP bomb attacks.
48pub struct PackageReader<'a> {
49    archive: ZipArchive<Cursor<&'a [u8]>>,
50    section_count: usize,
51    /// Cumulative bytes decompressed so far.
52    total_read: u64,
53}
54
/// Describes one ZIP entry of an HWPX package without its contents.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct PackageEntryInfo {
    /// Path of the entry inside the archive, e.g. `Contents/section0.xml`.
    pub path: String,
    /// Uncompressed size, as declared by the ZIP entry header.
    pub size: u64,
    /// Compressed size, as declared by the ZIP entry header.
    pub compressed_size: u64,
}
65
66impl<'a> PackageReader<'a> {
67    /// Opens an HWPX archive from raw bytes.
68    ///
69    /// Validates:
70    /// - The bytes form a valid ZIP archive
71    /// - The entry count is within safety limits
72    /// - A `mimetype` file exists with an accepted value
73    pub fn new(bytes: &'a [u8]) -> HwpxResult<Self> {
74        let cursor = Cursor::new(bytes);
75        let archive = ZipArchive::new(cursor).map_err(|e| HwpxError::Zip(e.to_string()))?;
76
77        if archive.len() > MAX_ENTRIES {
78            return Err(HwpxError::InvalidStructure {
79                detail: format!(
80                    "archive has {} entries, exceeds limit of {}",
81                    archive.len(),
82                    MAX_ENTRIES,
83                ),
84            });
85        }
86
87        // Count section files
88        let section_count = archive
89            .file_names()
90            .filter(|name| name.starts_with(SECTION_PREFIX) && name.ends_with(SECTION_SUFFIX))
91            .count();
92
93        let mut reader = Self { archive, section_count, total_read: 0 };
94
95        // Validate mimetype
96        reader.validate_mimetype()?;
97
98        Ok(reader)
99    }
100
101    /// Validates the `mimetype` file in the archive.
102    fn validate_mimetype(&mut self) -> HwpxResult<()> {
103        let content = self.read_entry(MIMETYPE_PATH)?;
104        let trimmed = content.trim();
105
106        if !ACCEPTED_MIMETYPES.contains(&trimmed) {
107            return Err(HwpxError::InvalidMimetype { actual: trimmed.to_string() });
108        }
109
110        Ok(())
111    }
112
113    /// Returns the raw XML content of `Contents/header.xml`.
114    pub fn read_header_xml(&mut self) -> HwpxResult<String> {
115        self.read_entry(HEADER_PATH)
116    }
117
118    /// Returns the raw XML content of `Contents/section{index}.xml`.
119    ///
120    /// Sections are zero-indexed: section 0, section 1, etc.
121    pub fn read_section_xml(&mut self, index: usize) -> HwpxResult<String> {
122        let path = format!("{}{}{}", SECTION_PREFIX, index, SECTION_SUFFIX);
123        self.read_entry(&path)
124    }
125
126    /// Returns the number of section files found in the archive.
127    pub fn section_count(&self) -> usize {
128        self.section_count
129    }
130
131    /// Returns metadata for every entry in the archive.
132    ///
133    /// Entries are returned in ZIP order so callers can compare package
134    /// structure directly against a fixture.
135    pub fn list_entries(&mut self) -> HwpxResult<Vec<PackageEntryInfo>> {
136        let mut entries = Vec::with_capacity(self.archive.len());
137        for index in 0..self.archive.len() {
138            let file = self.archive.by_index(index).map_err(|e| HwpxError::Zip(e.to_string()))?;
139            entries.push(PackageEntryInfo {
140                path: file.name().to_string(),
141                size: file.size(),
142                compressed_size: file.compressed_size(),
143            });
144        }
145        Ok(entries)
146    }
147
148    /// Reads an arbitrary archive entry as UTF-8 text.
149    ///
150    /// This is primarily useful for package-census tooling that needs raw
151    /// access to files such as `Contents/content.hpf`.
152    pub fn read_text_entry(&mut self, path: &str) -> HwpxResult<String> {
153        self.read_entry(path)
154    }
155
156    /// Reads all `Contents/masterpage*.xml` entries from the archive.
157    ///
158    /// Returns a map from masterpage index to XML content.
159    /// E.g., `{0: "<masterPage>...</masterPage>"}` for `Contents/masterpage0.xml`.
160    pub fn read_masterpage_xmls(&mut self) -> HwpxResult<std::collections::HashMap<usize, String>> {
161        let mp_paths: Vec<(usize, String)> = self
162            .archive
163            .file_names()
164            .filter_map(|name| {
165                let stripped = name.strip_prefix("Contents/masterpage")?;
166                let idx_str = stripped.strip_suffix(".xml")?;
167                let idx: usize = idx_str.parse().ok()?;
168                Some((idx, name.to_string()))
169            })
170            .collect();
171
172        let mut result = std::collections::HashMap::new();
173        for (idx, path) in mp_paths {
174            let xml = self.read_entry(&path)?;
175            result.insert(idx, xml);
176        }
177        Ok(result)
178    }
179
180    /// Reads all `Chart/*.xml` entries from the archive into a map.
181    ///
182    /// Each entry's full path (e.g. `"Chart/chart1.xml"`) becomes the key,
183    /// and the XML string becomes the value.
184    pub fn read_chart_xmls(&mut self) -> HwpxResult<std::collections::HashMap<String, String>> {
185        let chart_paths: Vec<String> = self
186            .archive
187            .file_names()
188            .filter(|name| name.starts_with("Chart/") && name.ends_with(".xml"))
189            .map(|s| s.to_string())
190            .collect();
191
192        let mut map = std::collections::HashMap::new();
193        for path in chart_paths {
194            let xml = self.read_entry(&path)?;
195            map.insert(path, xml);
196        }
197        Ok(map)
198    }
199
200    /// Reads all `BinData/*` entries from the archive into an
201    /// [`hwpforge_core::image::ImageStore`].
202    ///
203    /// Each entry's filename (without the `BinData/` prefix) becomes the
204    /// key in the store, and the raw bytes become the value.
205    ///
206    /// Keys are sanitized to prevent path traversal (CWE-22): `..` components
207    /// and leading slashes are stripped before insertion.
208    pub fn read_all_bindata(&mut self) -> HwpxResult<hwpforge_core::image::ImageStore> {
209        let bindata_paths: Vec<String> = self
210            .archive
211            .file_names()
212            .filter(|name| name.starts_with("BinData/") && name.len() > "BinData/".len())
213            .map(|s| s.to_string())
214            .collect();
215
216        let mut store = hwpforge_core::image::ImageStore::new();
217
218        for path in bindata_paths {
219            let data = self.read_binary_entry(&path)?;
220            let raw_key = path.strip_prefix("BinData/").unwrap_or(&path);
221            // Sanitize to prevent path traversal: strip ".." and leading slashes.
222            let key = sanitize_bindata_key(raw_key);
223            if !key.is_empty() {
224                store.insert(&key, data);
225            }
226        }
227
228        Ok(store)
229    }
230
231    /// Reads a single entry from the archive as raw bytes.
232    ///
233    /// Similar to [`read_entry`] but returns `Vec<u8>` instead of `String`.
234    fn read_binary_entry(&mut self, path: &str) -> HwpxResult<Vec<u8>> {
235        let file = self
236            .archive
237            .by_name(path)
238            .map_err(|_| HwpxError::MissingFile { path: path.to_string() })?;
239
240        let hint = file.size().min(MAX_ENTRY_SIZE) as usize;
241        let mut limited = file.take(MAX_ENTRY_SIZE + 1);
242
243        let mut buf = Vec::with_capacity(hint);
244        std::io::Read::read_to_end(&mut limited, &mut buf)
245            .map_err(|e| HwpxError::Zip(format!("read '{}': {}", path, e)))?;
246
247        if buf.len() as u64 > MAX_ENTRY_SIZE {
248            return Err(HwpxError::InvalidStructure {
249                detail: format!(
250                    "entry '{}' decompressed to {} bytes, exceeds limit of {}",
251                    path,
252                    buf.len(),
253                    MAX_ENTRY_SIZE,
254                ),
255            });
256        }
257
258        self.total_read += buf.len() as u64;
259        if self.total_read > MAX_TOTAL_SIZE {
260            return Err(HwpxError::InvalidStructure {
261                detail: format!(
262                    "total decompressed data ({} bytes) exceeds limit of {}",
263                    self.total_read, MAX_TOTAL_SIZE,
264                ),
265            });
266        }
267
268        Ok(buf)
269    }
270
271    /// Reads a single entry from the archive as a UTF-8 string.
272    ///
273    /// Uses `Read::take()` to enforce the per-entry size limit regardless
274    /// of what the ZIP central directory reports (defense against ZIP bombs).
275    fn read_entry(&mut self, path: &str) -> HwpxResult<String> {
276        let file = self
277            .archive
278            .by_name(path)
279            .map_err(|_| HwpxError::MissingFile { path: path.to_string() })?;
280
281        // Use take() to enforce actual decompressed size limit.
282        // file.size() comes from the ZIP header and can be spoofed,
283        // so we cap the reader itself to MAX_ENTRY_SIZE + 1 bytes.
284        let hint = file.size().min(MAX_ENTRY_SIZE) as usize;
285        let mut limited = file.take(MAX_ENTRY_SIZE + 1);
286
287        let mut buf = String::with_capacity(hint);
288        limited
289            .read_to_string(&mut buf)
290            .map_err(|e| HwpxError::Zip(format!("read '{}': {}", path, e)))?;
291
292        if buf.len() as u64 > MAX_ENTRY_SIZE {
293            return Err(HwpxError::InvalidStructure {
294                detail: format!(
295                    "entry '{}' decompressed to {} bytes, exceeds limit of {}",
296                    path,
297                    buf.len(),
298                    MAX_ENTRY_SIZE,
299                ),
300            });
301        }
302
303        // Enforce cumulative budget
304        self.total_read += buf.len() as u64;
305        if self.total_read > MAX_TOTAL_SIZE {
306            return Err(HwpxError::InvalidStructure {
307                detail: format!(
308                    "total decompressed data ({} bytes) exceeds limit of {}",
309                    self.total_read, MAX_TOTAL_SIZE,
310                ),
311            });
312        }
313
314        Ok(buf)
315    }
316}
317
/// Produces a traversal-safe key from a BinData filename (CWE-22).
///
/// The name is split on `/`; empty components (which removes leading,
/// trailing and doubled slashes) and `..` components are discarded, and the
/// remaining components are re-joined with single `/` separators. The doc
/// of the original states this matches the encoder-side
/// `sanitize_zip_entry_name` logic.
fn sanitize_bindata_key(name: &str) -> String {
    let mut cleaned = String::with_capacity(name.len());
    for component in name.split('/') {
        if component.is_empty() || component == ".." {
            continue;
        }
        // Kept components are never empty, so `cleaned` is empty only
        // before the first one has been appended.
        if !cleaned.is_empty() {
            cleaned.push('/');
        }
        cleaned.push_str(component);
    }
    cleaned
}
325
326impl std::fmt::Debug for PackageReader<'_> {
327    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
328        f.debug_struct("PackageReader")
329            .field("entries", &self.archive.len())
330            .field("sections", &self.section_count)
331            .field("total_read", &self.total_read)
332            .finish()
333    }
334}
335
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use zip::write::SimpleFileOptions;
    use zip::ZipWriter;

    /// The mimetype used by most fixtures.
    const HWPX_MIMETYPE: &str = "application/hwp+zip";

    const MINIMAL_HEADER: &str =
        r#"<?xml version="1.0" encoding="UTF-8"?><head version="1.4" secCnt="1"></head>"#;

    const MINIMAL_SECTION: &str = r#"<?xml version="1.0" encoding="UTF-8"?><sec></sec>"#;

    /// Builds an in-memory ZIP laid out like an HWPX package: a stored
    /// `mimetype` entry first, then the header, then one entry per section.
    fn make_hwpx_zip(mimetype: &str, header_xml: &str, sections: &[&str]) -> Vec<u8> {
        let default_opts = SimpleFileOptions::default();
        // The mimetype entry is written without compression.
        let stored =
            SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);

        let mut writer = ZipWriter::new(Cursor::new(Vec::new()));

        writer.start_file("mimetype", stored).unwrap();
        writer.write_all(mimetype.as_bytes()).unwrap();

        writer.start_file("Contents/header.xml", default_opts).unwrap();
        writer.write_all(header_xml.as_bytes()).unwrap();

        for (index, body) in sections.iter().enumerate() {
            let path = format!("Contents/section{}.xml", index);
            writer.start_file(&path, default_opts).unwrap();
            writer.write_all(body.as_bytes()).unwrap();
        }

        writer.finish().unwrap().into_inner()
    }

    /// Shorthand for a package with the minimal header and a single section.
    fn single_section_zip(mimetype: &str) -> Vec<u8> {
        make_hwpx_zip(mimetype, MINIMAL_HEADER, &[MINIMAL_SECTION])
    }

    // ── Construction ─────────────────────────────────────────────

    #[test]
    fn new_valid_hwpx() {
        let bytes = single_section_zip(HWPX_MIMETYPE);
        let reader = PackageReader::new(&bytes).unwrap();
        assert_eq!(reader.section_count(), 1);
    }

    #[test]
    fn new_alternative_mimetype() {
        let bytes = single_section_zip("application/haansofthwp+zip");
        assert!(PackageReader::new(&bytes).is_ok());
    }

    #[test]
    fn new_vnd_mimetype() {
        let bytes = single_section_zip("application/vnd.hancom.hwp+zip");
        assert!(PackageReader::new(&bytes).is_ok());
    }

    #[test]
    fn new_not_a_zip() {
        let result = PackageReader::new(b"not a zip file");
        assert!(matches!(result, Err(HwpxError::Zip(_))));
    }

    #[test]
    fn new_wrong_mimetype() {
        let bytes = single_section_zip("application/pdf");
        match PackageReader::new(&bytes) {
            Err(HwpxError::InvalidMimetype { actual }) => {
                assert_eq!(actual, "application/pdf");
            }
            other => panic!("expected InvalidMimetype, got: {other:?}"),
        }
    }

    #[test]
    fn new_empty_zip_missing_mimetype() {
        let empty = ZipWriter::new(Cursor::new(Vec::new()));
        let bytes = empty.finish().unwrap().into_inner();
        let result = PackageReader::new(&bytes);
        assert!(matches!(result, Err(HwpxError::MissingFile { .. })));
    }

    // ── Reading entries ──────────────────────────────────────────

    #[test]
    fn read_header_xml() {
        let bytes = single_section_zip(HWPX_MIMETYPE);
        let mut reader = PackageReader::new(&bytes).unwrap();
        let header = reader.read_header_xml().unwrap();
        assert!(header.contains("head"));
    }

    #[test]
    fn read_section_xml_index_0() {
        let bytes = single_section_zip(HWPX_MIMETYPE);
        let mut reader = PackageReader::new(&bytes).unwrap();
        let section = reader.read_section_xml(0).unwrap();
        assert!(section.contains("sec"));
    }

    #[test]
    fn read_section_xml_out_of_range() {
        let bytes = single_section_zip(HWPX_MIMETYPE);
        let mut reader = PackageReader::new(&bytes).unwrap();
        let result = reader.read_section_xml(99);
        assert!(matches!(result, Err(HwpxError::MissingFile { .. })));
    }

    #[test]
    fn multiple_sections() {
        let bodies = ["<sec>section0</sec>", "<sec>section1</sec>", "<sec>section2</sec>"];
        let bytes = make_hwpx_zip(HWPX_MIMETYPE, MINIMAL_HEADER, &bodies);
        let mut reader = PackageReader::new(&bytes).unwrap();
        assert_eq!(reader.section_count(), 3);
        for index in 0..bodies.len() {
            let marker = format!("section{}", index);
            assert!(reader.read_section_xml(index).unwrap().contains(&marker));
        }
    }

    // ── Debug impl ───────────────────────────────────────────────

    #[test]
    fn debug_impl() {
        let bytes = single_section_zip(HWPX_MIMETYPE);
        let reader = PackageReader::new(&bytes).unwrap();
        let rendered = format!("{reader:?}");
        assert!(rendered.contains("PackageReader"));
        assert!(rendered.contains("sections: 1"));
    }

    // ── Mimetype trimming ────────────────────────────────────────

    #[test]
    fn mimetype_with_trailing_whitespace() {
        let bytes = single_section_zip("application/hwp+zip  \n");
        assert!(PackageReader::new(&bytes).is_ok());
    }

    // ── sanitize_bindata_key ──────────────────────────────────────

    #[test]
    fn sanitize_bindata_key_strips_traversal() {
        let cases = [
            ("../../../etc/passwd", "etc/passwd"),
            ("BinData/../secret", "BinData/secret"),
            ("image.png", "image.png"),
            ("..", ""),
            ("a/../../b", "a/b"),
            ("//double//slash", "double/slash"),
        ];
        for (input, expected) in cases {
            assert_eq!(sanitize_bindata_key(input), expected, "input: {input:?}");
        }
    }
}