use std::io::{Cursor, Read};
use zip::ZipArchive;
use crate::error::{HwpxError, HwpxResult};
/// Largest decompressed size, in bytes, accepted for a single archive entry (50 MiB).
const MAX_ENTRY_SIZE: u64 = 50 * 1024 * 1024;
/// Cap, in bytes, on the cumulative decompressed data read through one reader (500 MiB).
const MAX_TOTAL_SIZE: u64 = 500 * 1024 * 1024;
/// Maximum number of entries an archive may contain before it is rejected.
const MAX_ENTRIES: usize = 10_000;
/// Values of the `mimetype` entry (after whitespace trimming) that are accepted.
const ACCEPTED_MIMETYPES: &[&str] =
&["application/hwp+zip", "application/haansofthwp+zip", "application/vnd.hancom.hwp+zip"];
/// Archive path of the entry holding the package mimetype.
const MIMETYPE_PATH: &str = "mimetype";
/// Archive path of the header XML document.
const HEADER_PATH: &str = "Contents/header.xml";
/// Section entries are named `{SECTION_PREFIX}{index}{SECTION_SUFFIX}`.
const SECTION_PREFIX: &str = "Contents/section";
/// File extension that closes a section entry name.
const SECTION_SUFFIX: &str = ".xml";
/// Reader over an HWPX package stored as a ZIP archive in a borrowed byte slice.
///
/// The reader keeps a running total of the decompressed bytes it has handed
/// out so that size limits can be enforced across several reads.
pub struct PackageReader<'a> {
/// ZIP archive parsed from the borrowed bytes.
archive: ZipArchive<Cursor<&'a [u8]>>,
/// Number of section entries found when the archive was opened.
section_count: usize,
/// Cumulative count of decompressed bytes read so far.
total_read: u64,
}
/// Name and size metadata describing one entry of the package archive.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PackageEntryInfo {
/// Entry name as stored in the archive.
pub path: String,
/// Uncompressed size in bytes, as reported by the archive.
pub size: u64,
/// Compressed size in bytes, as reported by the archive.
pub compressed_size: u64,
}
impl<'a> PackageReader<'a> {
pub fn new(bytes: &'a [u8]) -> HwpxResult<Self> {
let cursor = Cursor::new(bytes);
let archive = ZipArchive::new(cursor).map_err(|e| HwpxError::Zip(e.to_string()))?;
if archive.len() > MAX_ENTRIES {
return Err(HwpxError::InvalidStructure {
detail: format!(
"archive has {} entries, exceeds limit of {}",
archive.len(),
MAX_ENTRIES,
),
});
}
let section_count = archive
.file_names()
.filter(|name| name.starts_with(SECTION_PREFIX) && name.ends_with(SECTION_SUFFIX))
.count();
let mut reader = Self { archive, section_count, total_read: 0 };
reader.validate_mimetype()?;
Ok(reader)
}
fn validate_mimetype(&mut self) -> HwpxResult<()> {
let content = self.read_entry(MIMETYPE_PATH)?;
let trimmed = content.trim();
if !ACCEPTED_MIMETYPES.contains(&trimmed) {
return Err(HwpxError::InvalidMimetype { actual: trimmed.to_string() });
}
Ok(())
}
pub fn read_header_xml(&mut self) -> HwpxResult<String> {
self.read_entry(HEADER_PATH)
}
pub fn read_section_xml(&mut self, index: usize) -> HwpxResult<String> {
let path = format!("{}{}{}", SECTION_PREFIX, index, SECTION_SUFFIX);
self.read_entry(&path)
}
pub fn section_count(&self) -> usize {
self.section_count
}
pub fn list_entries(&mut self) -> HwpxResult<Vec<PackageEntryInfo>> {
let mut entries = Vec::with_capacity(self.archive.len());
for index in 0..self.archive.len() {
let file = self.archive.by_index(index).map_err(|e| HwpxError::Zip(e.to_string()))?;
entries.push(PackageEntryInfo {
path: file.name().to_string(),
size: file.size(),
compressed_size: file.compressed_size(),
});
}
Ok(entries)
}
pub fn read_text_entry(&mut self, path: &str) -> HwpxResult<String> {
self.read_entry(path)
}
pub fn read_masterpage_xmls(&mut self) -> HwpxResult<std::collections::HashMap<usize, String>> {
let mp_paths: Vec<(usize, String)> = self
.archive
.file_names()
.filter_map(|name| {
let stripped = name.strip_prefix("Contents/masterpage")?;
let idx_str = stripped.strip_suffix(".xml")?;
let idx: usize = idx_str.parse().ok()?;
Some((idx, name.to_string()))
})
.collect();
let mut result = std::collections::HashMap::new();
for (idx, path) in mp_paths {
let xml = self.read_entry(&path)?;
result.insert(idx, xml);
}
Ok(result)
}
pub fn read_chart_xmls(&mut self) -> HwpxResult<std::collections::HashMap<String, String>> {
let chart_paths: Vec<String> = self
.archive
.file_names()
.filter(|name| name.starts_with("Chart/") && name.ends_with(".xml"))
.map(|s| s.to_string())
.collect();
let mut map = std::collections::HashMap::new();
for path in chart_paths {
let xml = self.read_entry(&path)?;
map.insert(path, xml);
}
Ok(map)
}
pub fn read_all_bindata(&mut self) -> HwpxResult<hwpforge_core::image::ImageStore> {
let bindata_paths: Vec<String> = self
.archive
.file_names()
.filter(|name| name.starts_with("BinData/") && name.len() > "BinData/".len())
.map(|s| s.to_string())
.collect();
let mut store = hwpforge_core::image::ImageStore::new();
for path in bindata_paths {
let data = self.read_binary_entry(&path)?;
let raw_key = path.strip_prefix("BinData/").unwrap_or(&path);
let key = sanitize_bindata_key(raw_key);
if !key.is_empty() {
store.insert(&key, data);
}
}
Ok(store)
}
fn read_binary_entry(&mut self, path: &str) -> HwpxResult<Vec<u8>> {
let file = self
.archive
.by_name(path)
.map_err(|_| HwpxError::MissingFile { path: path.to_string() })?;
let hint = file.size().min(MAX_ENTRY_SIZE) as usize;
let mut limited = file.take(MAX_ENTRY_SIZE + 1);
let mut buf = Vec::with_capacity(hint);
std::io::Read::read_to_end(&mut limited, &mut buf)
.map_err(|e| HwpxError::Zip(format!("read '{}': {}", path, e)))?;
if buf.len() as u64 > MAX_ENTRY_SIZE {
return Err(HwpxError::InvalidStructure {
detail: format!(
"entry '{}' decompressed to {} bytes, exceeds limit of {}",
path,
buf.len(),
MAX_ENTRY_SIZE,
),
});
}
self.total_read += buf.len() as u64;
if self.total_read > MAX_TOTAL_SIZE {
return Err(HwpxError::InvalidStructure {
detail: format!(
"total decompressed data ({} bytes) exceeds limit of {}",
self.total_read, MAX_TOTAL_SIZE,
),
});
}
Ok(buf)
}
fn read_entry(&mut self, path: &str) -> HwpxResult<String> {
let file = self
.archive
.by_name(path)
.map_err(|_| HwpxError::MissingFile { path: path.to_string() })?;
let hint = file.size().min(MAX_ENTRY_SIZE) as usize;
let mut limited = file.take(MAX_ENTRY_SIZE + 1);
let mut buf = String::with_capacity(hint);
limited
.read_to_string(&mut buf)
.map_err(|e| HwpxError::Zip(format!("read '{}': {}", path, e)))?;
if buf.len() as u64 > MAX_ENTRY_SIZE {
return Err(HwpxError::InvalidStructure {
detail: format!(
"entry '{}' decompressed to {} bytes, exceeds limit of {}",
path,
buf.len(),
MAX_ENTRY_SIZE,
),
});
}
self.total_read += buf.len() as u64;
if self.total_read > MAX_TOTAL_SIZE {
return Err(HwpxError::InvalidStructure {
detail: format!(
"total decompressed data ({} bytes) exceeds limit of {}",
self.total_read, MAX_TOTAL_SIZE,
),
});
}
Ok(buf)
}
}
/// Normalizes a `BinData/`-relative entry name into a safe storage key.
///
/// The name is split into path components, and empty, `.` and `..`
/// components are discarded; the rest are re-joined with `/`. Backslashes are
/// treated as separators as well, so Windows-style `..\` sequences cannot
/// slip through. Returns an empty string when nothing remains.
fn sanitize_bindata_key(name: &str) -> String {
    name.split(|c: char| c == '/' || c == '\\')
        .filter(|component| !matches!(*component, "" | "." | ".."))
        .collect::<Vec<_>>()
        .join("/")
}
/// Compact diagnostic view: entry count, section count and bytes read so far.
/// The archive contents themselves are not printed.
impl std::fmt::Debug for PackageReader<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let entry_total = self.archive.len();
        let mut out = f.debug_struct("PackageReader");
        out.field("entries", &entry_total);
        out.field("sections", &self.section_count);
        out.field("total_read", &self.total_read);
        out.finish()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use zip::write::SimpleFileOptions;
    use zip::ZipWriter;

    /// Smallest header document used by the fixtures.
    const MINIMAL_HEADER: &str =
        r#"<?xml version="1.0" encoding="UTF-8"?><head version="1.4" secCnt="1"></head>"#;
    /// Smallest section document used by the fixtures.
    const MINIMAL_SECTION: &str = r#"<?xml version="1.0" encoding="UTF-8"?><sec></sec>"#;

    /// Builds an in-memory package: a stored `mimetype` entry, the header,
    /// and one `Contents/section{i}.xml` entry per element of `sections`.
    fn make_hwpx_zip(mimetype: &str, header_xml: &str, sections: &[&str]) -> Vec<u8> {
        let default_opts = SimpleFileOptions::default();
        let stored =
            SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
        let mut writer = ZipWriter::new(Cursor::new(Vec::<u8>::new()));

        writer.start_file("mimetype", stored).unwrap();
        writer.write_all(mimetype.as_bytes()).unwrap();

        writer.start_file("Contents/header.xml", default_opts).unwrap();
        writer.write_all(header_xml.as_bytes()).unwrap();

        for (index, body) in sections.iter().enumerate() {
            let entry_name = format!("Contents/section{}.xml", index);
            writer.start_file(&entry_name, default_opts).unwrap();
            writer.write_all(body.as_bytes()).unwrap();
        }

        let cursor = writer.finish().unwrap();
        cursor.into_inner()
    }

    /// Package with the given mimetype, the minimal header and one minimal section.
    fn single_section_package(mimetype: &str) -> Vec<u8> {
        make_hwpx_zip(mimetype, MINIMAL_HEADER, &[MINIMAL_SECTION])
    }

    #[test]
    fn new_valid_hwpx() {
        let package = single_section_package("application/hwp+zip");
        let reader = PackageReader::new(&package).unwrap();
        assert_eq!(reader.section_count(), 1);
    }

    #[test]
    fn new_alternative_mimetype() {
        let package = single_section_package("application/haansofthwp+zip");
        let opened = PackageReader::new(&package);
        assert!(opened.is_ok());
    }

    #[test]
    fn new_vnd_mimetype() {
        let package = single_section_package("application/vnd.hancom.hwp+zip");
        let opened = PackageReader::new(&package);
        assert!(opened.is_ok());
    }

    #[test]
    fn new_not_a_zip() {
        let failure = PackageReader::new(b"not a zip file").unwrap_err();
        assert!(matches!(failure, HwpxError::Zip(_)));
    }

    #[test]
    fn new_wrong_mimetype() {
        let package = single_section_package("application/pdf");
        let failure = PackageReader::new(&package).unwrap_err();
        match &failure {
            HwpxError::InvalidMimetype { actual } => {
                assert_eq!(actual.as_str(), "application/pdf");
            }
            other => panic!("expected InvalidMimetype, got: {other:?}"),
        }
    }

    #[test]
    fn new_empty_zip_missing_mimetype() {
        let writer = ZipWriter::new(Cursor::new(Vec::<u8>::new()));
        let package = writer.finish().unwrap().into_inner();
        let failure = PackageReader::new(&package).unwrap_err();
        assert!(matches!(failure, HwpxError::MissingFile { .. }));
    }

    #[test]
    fn read_header_xml() {
        let package = single_section_package("application/hwp+zip");
        let mut reader = PackageReader::new(&package).unwrap();
        let header = reader.read_header_xml().unwrap();
        assert!(header.contains("head"));
    }

    #[test]
    fn read_section_xml_index_0() {
        let package = single_section_package("application/hwp+zip");
        let mut reader = PackageReader::new(&package).unwrap();
        let section = reader.read_section_xml(0).unwrap();
        assert!(section.contains("sec"));
    }

    #[test]
    fn read_section_xml_out_of_range() {
        let package = single_section_package("application/hwp+zip");
        let mut reader = PackageReader::new(&package).unwrap();
        let failure = reader.read_section_xml(99).unwrap_err();
        assert!(matches!(failure, HwpxError::MissingFile { .. }));
    }

    #[test]
    fn multiple_sections() {
        let bodies = ["<sec>section0</sec>", "<sec>section1</sec>", "<sec>section2</sec>"];
        let package = make_hwpx_zip("application/hwp+zip", MINIMAL_HEADER, &bodies);
        let mut reader = PackageReader::new(&package).unwrap();
        assert_eq!(reader.section_count(), 3);
        let markers = ["section0", "section1", "section2"];
        for (index, marker) in markers.iter().enumerate() {
            assert!(reader.read_section_xml(index).unwrap().contains(*marker));
        }
    }

    #[test]
    fn debug_impl() {
        let package = single_section_package("application/hwp+zip");
        let reader = PackageReader::new(&package).unwrap();
        let rendered = format!("{reader:?}");
        for needle in ["PackageReader", "sections: 1"].iter() {
            assert!(rendered.contains(*needle));
        }
    }

    #[test]
    fn mimetype_with_trailing_whitespace() {
        let package = single_section_package("application/hwp+zip \n");
        let opened = PackageReader::new(&package);
        assert!(opened.is_ok());
    }

    #[test]
    fn sanitize_bindata_key_strips_traversal() {
        let cases = [
            ("../../../etc/passwd", "etc/passwd"),
            ("BinData/../secret", "BinData/secret"),
            ("image.png", "image.png"),
            ("..", ""),
            ("a/../../b", "a/b"),
            ("//double//slash", "double/slash"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(sanitize_bindata_key(input), *expected);
        }
    }
}