hwpforge_smithy_hwpx/decoder/
package.rs1use std::io::{Cursor, Read};
7
8use zip::ZipArchive;
9
10use crate::error::{HwpxError, HwpxResult};
11
/// Maximum decompressed size accepted for a single archive entry (50 MiB).
const MAX_ENTRY_SIZE: u64 = 50 * 1024 * 1024;

/// Maximum cumulative decompressed size read through one reader (500 MiB).
const MAX_TOTAL_SIZE: u64 = 500 * 1024 * 1024;

/// Maximum number of entries an archive may contain before it is rejected.
const MAX_ENTRIES: usize = 10_000;

/// Contents of the `mimetype` entry (after trimming) that are accepted.
const ACCEPTED_MIMETYPES: &[&str] =
    &["application/hwp+zip", "application/haansofthwp+zip", "application/vnd.hancom.hwp+zip"];

/// Archive path of the entry holding the package mimetype.
const MIMETYPE_PATH: &str = "mimetype";

/// Archive path of the header XML entry.
const HEADER_PATH: &str = "Contents/header.xml";

/// Leading part of a section entry path; the section index follows it.
const SECTION_PREFIX: &str = "Contents/section";

/// Trailing part of a section entry path; it follows the section index.
const SECTION_SUFFIX: &str = ".xml";
40
/// Reader over a ZIP package held in a borrowed byte slice.
///
/// Tracks how many section entries the archive contains and how many
/// decompressed bytes have been read so far, so reads can be bounded.
pub struct PackageReader<'a> {
    /// The opened ZIP archive, backed by the caller's byte slice.
    archive: ZipArchive<Cursor<&'a [u8]>>,
    /// Number of section entries found when the reader was constructed.
    section_count: usize,
    /// Running total of decompressed bytes read, checked against `MAX_TOTAL_SIZE`.
    total_read: u64,
}
54
/// Metadata describing one entry of the package archive.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PackageEntryInfo {
    /// Entry name as stored in the archive.
    pub path: String,
    /// Size of the entry as reported by the archive's `size()` accessor.
    pub size: u64,
    /// Size of the entry as reported by the archive's `compressed_size()` accessor.
    pub compressed_size: u64,
}
65
66impl<'a> PackageReader<'a> {
67 pub fn new(bytes: &'a [u8]) -> HwpxResult<Self> {
74 let cursor = Cursor::new(bytes);
75 let archive = ZipArchive::new(cursor).map_err(|e| HwpxError::Zip(e.to_string()))?;
76
77 if archive.len() > MAX_ENTRIES {
78 return Err(HwpxError::InvalidStructure {
79 detail: format!(
80 "archive has {} entries, exceeds limit of {}",
81 archive.len(),
82 MAX_ENTRIES,
83 ),
84 });
85 }
86
87 let section_count = archive
89 .file_names()
90 .filter(|name| name.starts_with(SECTION_PREFIX) && name.ends_with(SECTION_SUFFIX))
91 .count();
92
93 let mut reader = Self { archive, section_count, total_read: 0 };
94
95 reader.validate_mimetype()?;
97
98 Ok(reader)
99 }
100
101 fn validate_mimetype(&mut self) -> HwpxResult<()> {
103 let content = self.read_entry(MIMETYPE_PATH)?;
104 let trimmed = content.trim();
105
106 if !ACCEPTED_MIMETYPES.contains(&trimmed) {
107 return Err(HwpxError::InvalidMimetype { actual: trimmed.to_string() });
108 }
109
110 Ok(())
111 }
112
113 pub fn read_header_xml(&mut self) -> HwpxResult<String> {
115 self.read_entry(HEADER_PATH)
116 }
117
118 pub fn read_section_xml(&mut self, index: usize) -> HwpxResult<String> {
122 let path = format!("{}{}{}", SECTION_PREFIX, index, SECTION_SUFFIX);
123 self.read_entry(&path)
124 }
125
126 pub fn section_count(&self) -> usize {
128 self.section_count
129 }
130
131 pub fn list_entries(&mut self) -> HwpxResult<Vec<PackageEntryInfo>> {
136 let mut entries = Vec::with_capacity(self.archive.len());
137 for index in 0..self.archive.len() {
138 let file = self.archive.by_index(index).map_err(|e| HwpxError::Zip(e.to_string()))?;
139 entries.push(PackageEntryInfo {
140 path: file.name().to_string(),
141 size: file.size(),
142 compressed_size: file.compressed_size(),
143 });
144 }
145 Ok(entries)
146 }
147
148 pub fn read_text_entry(&mut self, path: &str) -> HwpxResult<String> {
153 self.read_entry(path)
154 }
155
156 pub fn read_masterpage_xmls(&mut self) -> HwpxResult<std::collections::HashMap<usize, String>> {
161 let mp_paths: Vec<(usize, String)> = self
162 .archive
163 .file_names()
164 .filter_map(|name| {
165 let stripped = name.strip_prefix("Contents/masterpage")?;
166 let idx_str = stripped.strip_suffix(".xml")?;
167 let idx: usize = idx_str.parse().ok()?;
168 Some((idx, name.to_string()))
169 })
170 .collect();
171
172 let mut result = std::collections::HashMap::new();
173 for (idx, path) in mp_paths {
174 let xml = self.read_entry(&path)?;
175 result.insert(idx, xml);
176 }
177 Ok(result)
178 }
179
180 pub fn read_chart_xmls(&mut self) -> HwpxResult<std::collections::HashMap<String, String>> {
185 let chart_paths: Vec<String> = self
186 .archive
187 .file_names()
188 .filter(|name| name.starts_with("Chart/") && name.ends_with(".xml"))
189 .map(|s| s.to_string())
190 .collect();
191
192 let mut map = std::collections::HashMap::new();
193 for path in chart_paths {
194 let xml = self.read_entry(&path)?;
195 map.insert(path, xml);
196 }
197 Ok(map)
198 }
199
200 pub fn read_all_bindata(&mut self) -> HwpxResult<hwpforge_core::image::ImageStore> {
209 let bindata_paths: Vec<String> = self
210 .archive
211 .file_names()
212 .filter(|name| name.starts_with("BinData/") && name.len() > "BinData/".len())
213 .map(|s| s.to_string())
214 .collect();
215
216 let mut store = hwpforge_core::image::ImageStore::new();
217
218 for path in bindata_paths {
219 let data = self.read_binary_entry(&path)?;
220 let raw_key = path.strip_prefix("BinData/").unwrap_or(&path);
221 let key = sanitize_bindata_key(raw_key);
223 if !key.is_empty() {
224 store.insert(&key, data);
225 }
226 }
227
228 Ok(store)
229 }
230
231 fn read_binary_entry(&mut self, path: &str) -> HwpxResult<Vec<u8>> {
235 let file = self
236 .archive
237 .by_name(path)
238 .map_err(|_| HwpxError::MissingFile { path: path.to_string() })?;
239
240 let hint = file.size().min(MAX_ENTRY_SIZE) as usize;
241 let mut limited = file.take(MAX_ENTRY_SIZE + 1);
242
243 let mut buf = Vec::with_capacity(hint);
244 std::io::Read::read_to_end(&mut limited, &mut buf)
245 .map_err(|e| HwpxError::Zip(format!("read '{}': {}", path, e)))?;
246
247 if buf.len() as u64 > MAX_ENTRY_SIZE {
248 return Err(HwpxError::InvalidStructure {
249 detail: format!(
250 "entry '{}' decompressed to {} bytes, exceeds limit of {}",
251 path,
252 buf.len(),
253 MAX_ENTRY_SIZE,
254 ),
255 });
256 }
257
258 self.total_read += buf.len() as u64;
259 if self.total_read > MAX_TOTAL_SIZE {
260 return Err(HwpxError::InvalidStructure {
261 detail: format!(
262 "total decompressed data ({} bytes) exceeds limit of {}",
263 self.total_read, MAX_TOTAL_SIZE,
264 ),
265 });
266 }
267
268 Ok(buf)
269 }
270
271 fn read_entry(&mut self, path: &str) -> HwpxResult<String> {
276 let file = self
277 .archive
278 .by_name(path)
279 .map_err(|_| HwpxError::MissingFile { path: path.to_string() })?;
280
281 let hint = file.size().min(MAX_ENTRY_SIZE) as usize;
285 let mut limited = file.take(MAX_ENTRY_SIZE + 1);
286
287 let mut buf = String::with_capacity(hint);
288 limited
289 .read_to_string(&mut buf)
290 .map_err(|e| HwpxError::Zip(format!("read '{}': {}", path, e)))?;
291
292 if buf.len() as u64 > MAX_ENTRY_SIZE {
293 return Err(HwpxError::InvalidStructure {
294 detail: format!(
295 "entry '{}' decompressed to {} bytes, exceeds limit of {}",
296 path,
297 buf.len(),
298 MAX_ENTRY_SIZE,
299 ),
300 });
301 }
302
303 self.total_read += buf.len() as u64;
305 if self.total_read > MAX_TOTAL_SIZE {
306 return Err(HwpxError::InvalidStructure {
307 detail: format!(
308 "total decompressed data ({} bytes) exceeds limit of {}",
309 self.total_read, MAX_TOTAL_SIZE,
310 ),
311 });
312 }
313
314 Ok(buf)
315 }
316}
317
/// Normalizes an archive-supplied `BinData` entry name into a safe store key.
///
/// The name is split on both `/` and `\`, and empty, `.` and `..` components
/// are dropped; the remaining components are re-joined with `/`. Backslashes
/// are treated as separators so that a Windows-style traversal such as
/// `..\..\secret` cannot pass through as a single opaque component.
///
/// Returns an empty string when no component survives.
fn sanitize_bindata_key(name: &str) -> String {
    name.split(|c: char| matches!(c, '/' | '\\'))
        .filter(|component| !matches!(*component, "" | "." | ".."))
        .collect::<Vec<_>>()
        .join("/")
}
325
326impl std::fmt::Debug for PackageReader<'_> {
327 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
328 f.debug_struct("PackageReader")
329 .field("entries", &self.archive.len())
330 .field("sections", &self.section_count)
331 .field("total_read", &self.total_read)
332 .finish()
333 }
334}
335
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use zip::write::SimpleFileOptions;
    use zip::ZipWriter;

    /// Smallest header document used by the fixtures.
    const MINIMAL_HEADER: &str =
        r#"<?xml version="1.0" encoding="UTF-8"?><head version="1.4" secCnt="1"></head>"#;

    /// Smallest section document used by the fixtures.
    const MINIMAL_SECTION: &str = r#"<?xml version="1.0" encoding="UTF-8"?><sec></sec>"#;

    /// Builds an in-memory package with a stored `mimetype` entry, a header
    /// and one `Contents/section<i>.xml` entry per element of `sections`.
    fn make_hwpx_zip(mimetype: &str, header_xml: &str, sections: &[&str]) -> Vec<u8> {
        let mut writer = ZipWriter::new(Cursor::new(Vec::new()));
        let default_opts = SimpleFileOptions::default();
        let stored_opts =
            SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);

        writer.start_file("mimetype", stored_opts).unwrap();
        writer.write_all(mimetype.as_bytes()).unwrap();

        writer.start_file("Contents/header.xml", default_opts).unwrap();
        writer.write_all(header_xml.as_bytes()).unwrap();

        for (index, body) in sections.iter().enumerate() {
            let path = format!("Contents/section{}.xml", index);
            writer.start_file(&path, default_opts).unwrap();
            writer.write_all(body.as_bytes()).unwrap();
        }

        writer.finish().unwrap().into_inner()
    }

    /// Package with the given mimetype, the minimal header and one section.
    fn single_section_package(mimetype: &str) -> Vec<u8> {
        make_hwpx_zip(mimetype, MINIMAL_HEADER, &[MINIMAL_SECTION])
    }

    // --- construction -----------------------------------------------------

    #[test]
    fn new_valid_hwpx() {
        let package = single_section_package("application/hwp+zip");
        let reader = PackageReader::new(&package).unwrap();
        assert_eq!(reader.section_count(), 1);
    }

    #[test]
    fn new_alternative_mimetype() {
        let package = single_section_package("application/haansofthwp+zip");
        assert!(PackageReader::new(&package).is_ok());
    }

    #[test]
    fn new_vnd_mimetype() {
        let package = single_section_package("application/vnd.hancom.hwp+zip");
        assert!(PackageReader::new(&package).is_ok());
    }

    #[test]
    fn new_not_a_zip() {
        let outcome = PackageReader::new(b"not a zip file");
        assert!(matches!(outcome.unwrap_err(), HwpxError::Zip(_)));
    }

    #[test]
    fn new_wrong_mimetype() {
        let package = single_section_package("application/pdf");
        let err = PackageReader::new(&package).unwrap_err();
        match &err {
            HwpxError::InvalidMimetype { actual } => {
                assert_eq!(actual.as_str(), "application/pdf");
            }
            _ => panic!("expected InvalidMimetype, got: {err:?}"),
        }
    }

    #[test]
    fn new_empty_zip_missing_mimetype() {
        let empty_archive = ZipWriter::new(Cursor::new(Vec::new())).finish().unwrap().into_inner();
        let err = PackageReader::new(&empty_archive).unwrap_err();
        assert!(matches!(err, HwpxError::MissingFile { .. }));
    }

    // --- reading entries --------------------------------------------------

    #[test]
    fn read_header_xml() {
        let package = single_section_package("application/hwp+zip");
        let mut reader = PackageReader::new(&package).unwrap();
        let header = reader.read_header_xml().unwrap();
        assert!(header.contains("head"));
    }

    #[test]
    fn read_section_xml_index_0() {
        let package = single_section_package("application/hwp+zip");
        let mut reader = PackageReader::new(&package).unwrap();
        let section = reader.read_section_xml(0).unwrap();
        assert!(section.contains("sec"));
    }

    #[test]
    fn read_section_xml_out_of_range() {
        let package = single_section_package("application/hwp+zip");
        let mut reader = PackageReader::new(&package).unwrap();
        let outcome = reader.read_section_xml(99);
        assert!(matches!(outcome.unwrap_err(), HwpxError::MissingFile { .. }));
    }

    #[test]
    fn multiple_sections() {
        let bodies = [
            r#"<sec>section0</sec>"#,
            r#"<sec>section1</sec>"#,
            r#"<sec>section2</sec>"#,
        ];
        let package = make_hwpx_zip("application/hwp+zip", MINIMAL_HEADER, &bodies);
        let mut reader = PackageReader::new(&package).unwrap();
        assert_eq!(reader.section_count(), 3);

        let markers = ["section0", "section1", "section2"];
        for (index, marker) in markers.iter().enumerate() {
            assert!(reader.read_section_xml(index).unwrap().contains(*marker));
        }
    }

    // --- miscellaneous ----------------------------------------------------

    #[test]
    fn debug_impl() {
        let package = single_section_package("application/hwp+zip");
        let reader = PackageReader::new(&package).unwrap();
        let rendered = format!("{reader:?}");
        assert!(rendered.contains("PackageReader"));
        assert!(rendered.contains("sections: 1"));
    }

    #[test]
    fn mimetype_with_trailing_whitespace() {
        let package = single_section_package("application/hwp+zip \n");
        assert!(PackageReader::new(&package).is_ok());
    }

    #[test]
    fn sanitize_bindata_key_strips_traversal() {
        let cases: &[(&str, &str)] = &[
            ("../../../etc/passwd", "etc/passwd"),
            ("BinData/../secret", "BinData/secret"),
            ("image.png", "image.png"),
            ("..", ""),
            ("a/../../b", "a/b"),
            ("//double//slash", "double/slash"),
        ];
        for &(input, expected) in cases {
            assert_eq!(sanitize_bindata_key(input), expected);
        }
    }
}