mod container;
mod header;
mod section;
mod styles;
pub use container::HwpxContainer;
use crate::error::Result;
use crate::model::Document;
use crate::streaming::{ParseEvent, SectionStreamOptions};
use quick_xml::events::Event;
use quick_xml::Reader;
#[cfg(not(target_arch = "wasm32"))]
use rayon::prelude::*;
use std::io::{Read, Seek};
use std::ops::ControlFlow;
use std::path::Path;
/// HWPML 2011 XML namespace URIs.
pub mod ns {
/// Namespace URI ending in `/paragraph`.
pub const HP: &str = "http://www.hancom.co.kr/hwpml/2011/paragraph";
/// Namespace URI ending in `/core`.
pub const HC: &str = "http://www.hancom.co.kr/hwpml/2011/core";
/// Namespace URI ending in `/head`.
pub const HH: &str = "http://www.hancom.co.kr/hwpml/2011/head";
/// Namespace URI ending in `/master`.
pub const HM: &str = "http://www.hancom.co.kr/hwpml/2011/master";
}
/// Parser that turns the contents of an [`HwpxContainer`] into a [`Document`],
/// either all at once (`parse`) or as a stream of events (`for_each_section`).
pub struct HwpxParser {
// Package the parser reads its files from.
container: HwpxContainer,
}
impl HwpxParser {
/// Creates a parser over the container opened from `path`.
///
/// # Errors
/// Propagates whatever error `HwpxContainer::open` reports.
pub fn open(path: impl AsRef<Path>) -> Result<Self> {
    Ok(Self {
        container: HwpxContainer::open(path)?,
    })
}
/// Creates a parser over a container built from a seekable reader.
///
/// # Errors
/// Propagates whatever error `HwpxContainer::from_reader` reports.
pub fn from_reader<R: Read + Seek>(reader: R) -> Result<Self> {
    Ok(Self {
        container: HwpxContainer::from_reader(reader)?,
    })
}
/// Parses the whole document using `ParseOptions::default()`.
///
/// # Errors
/// Same as [`Self::parse_with_options`].
pub fn parse(&mut self) -> Result<Document> {
    let defaults = crate::ParseOptions::default();
    self.parse_with_options(&defaults)
}
/// Parses the whole document into a fresh [`Document`].
///
/// Steps run in a fixed order: metadata, header options, styles, sections,
/// and finally binary resources when `opts.extract_resources` is set.
///
/// # Errors
/// Returns the first error produced by any of the steps.
pub fn parse_with_options(&mut self, opts: &crate::ParseOptions) -> Result<Document> {
    let mut doc = Document::new();
    doc.metadata.format_version = Some(String::from("HWPX"));

    self.parse_metadata(&mut doc)?;
    self.parse_header_options(&mut doc)?;
    // Styles must be loaded before sections: section parsing receives the style table.
    self.parse_styles(&mut doc)?;
    self.parse_sections(&mut doc)?;

    if opts.extract_resources {
        self.extract_resources(&mut doc)?;
    }
    Ok(doc)
}
pub fn for_each_section<F>(&mut self, opts: SectionStreamOptions, mut f: F) -> Result<()>
where
F: FnMut(ParseEvent<'_>) -> ControlFlow<()>,
{
let (metadata, styles) = {
let mut tmp = Document::new();
tmp.metadata.format_version = Some("HWPX".to_string());
self.parse_metadata(&mut tmp)?;
self.parse_header_options(&mut tmp)?;
self.parse_styles(&mut tmp)?;
(tmp.metadata, tmp.styles)
};
let section_files = self.container.list_sections()?;
let section_count = section_files.len();
let image_map = self.container.build_image_map();
if f(ParseEvent::DocumentStart {
metadata: &metadata,
styles: &styles,
section_count,
image_map,
}) == ControlFlow::Break(())
{
return Ok(());
}
for (index, path) in section_files.iter().enumerate() {
match self.container.read_file(path) {
Err(e) if opts.error_mode == crate::parse_options::ErrorMode::Lenient => {
if f(ParseEvent::SectionFailed { index, error: e }) == ControlFlow::Break(()) {
return Ok(());
}
}
Err(e) => return Err(e),
Ok(xml) => match section::parse_section(&xml, index, &styles) {
Err(e) if opts.error_mode == crate::parse_options::ErrorMode::Lenient => {
if f(ParseEvent::SectionFailed { index, error: e })
== ControlFlow::Break(())
{
return Ok(());
}
}
Err(e) => return Err(e),
Ok(sec) => {
if f(ParseEvent::SectionParsed(&sec)) == ControlFlow::Break(()) {
return Ok(());
}
}
},
}
}
if f(ParseEvent::DocumentEnd) == ControlFlow::Break(()) {
return Ok(());
}
if opts.extract_resources {
if let Ok(resources) = self.container.list_bindata() {
for resource_path in resources {
if let Ok(data) = self.container.read_binary(&resource_path) {
let name = resource_path
.rsplit('/')
.next()
.unwrap_or(&resource_path)
.to_string();
if f(ParseEvent::ResourceExtracted { name, data }) == ControlFlow::Break(())
{
return Ok(());
}
}
}
}
}
Ok(())
}
fn parse_metadata(&mut self, document: &mut Document) -> Result<()> {
let content_hpf = self.container.read_content_hpf()?;
if let Some(title) = extract_metadata_field(&content_hpf, "title") {
document.metadata.title = Some(title);
}
if let Some(author) = extract_metadata_field(&content_hpf, "creator") {
document.metadata.author = Some(author);
}
if let Some(subject) = extract_metadata_field(&content_hpf, "description") {
document.metadata.subject = Some(subject);
}
if let Some(date) = extract_metadata_field(&content_hpf, "date") {
document.metadata.created = Some(date);
}
if let Some(modified) = extract_metadata_field(&content_hpf, "modified") {
document.metadata.modified = Some(modified);
}
let keywords = extract_keywords(&content_hpf);
if !keywords.is_empty() {
document.metadata.keywords = keywords;
}
if let Some(generator) = extract_metadata_field(&content_hpf, "generator") {
document.metadata.creator_app = Some(generator);
}
Ok(())
}
/// Loads style definitions from `Contents/header.xml` into `document.styles`.
///
/// If the file cannot be read the styles are left untouched and `Ok(())` is
/// returned.
///
/// # Errors
/// Propagates the error from `styles::parse_styles`.
fn parse_styles(&mut self, document: &mut Document) -> Result<()> {
    match self.container.read_file("Contents/header.xml") {
        Ok(header_xml) => {
            styles::parse_styles(&header_xml, &mut document.styles)?;
            Ok(())
        }
        // An unreadable header is tolerated.
        Err(_) => Ok(()),
    }
}
/// Reads `Contents/header.xml` and stores the value returned by
/// `header::parse_header` in `document.metadata.is_distribution`.
///
/// If the file cannot be read the flag is left untouched and `Ok(())` is
/// returned.
///
/// # Errors
/// Propagates the error from `header::parse_header`.
fn parse_header_options(&mut self, document: &mut Document) -> Result<()> {
    let header_xml = match self.container.read_file("Contents/header.xml") {
        Ok(xml) => xml,
        // An unreadable header is tolerated.
        Err(_) => return Ok(()),
    };
    document.metadata.is_distribution = header::parse_header(&header_xml)?;
    Ok(())
}
/// Reads and parses every section listed by the container and stores the
/// results in `document.sections`, sorted by section index.
///
/// This step is best-effort: a section whose file cannot be read, or whose
/// XML fails to parse, is skipped without reporting an error.
///
/// On non-wasm targets parsing runs on rayon's parallel iterator once at
/// least `PARALLEL_THRESHOLD` sections were read; otherwise (and always on
/// wasm32) it runs sequentially.
///
/// # Errors
/// Only fails when the section list itself cannot be obtained.
fn parse_sections(&mut self, document: &mut Document) -> Result<()> {
    let section_files = self.container.list_sections()?;

    // All file reads happen up front so that parsing can run without touching the container.
    let section_data: Vec<(usize, String)> = section_files
        .iter()
        .enumerate()
        .filter_map(|(index, path)| self.container.read_file(path).ok().map(|xml| (index, xml)))
        .collect();

    // The style table is only read while parsing, so borrow it instead of cloning it.
    let styles = &document.styles;
    // Single definition of the per-section step, shared by every code path below.
    let parse_one =
        |(index, xml): &(usize, String)| section::parse_section(xml, *index, styles).ok();

    #[cfg(not(target_arch = "wasm32"))]
    const PARALLEL_THRESHOLD: usize = 3;

    #[cfg(not(target_arch = "wasm32"))]
    let mut sections: Vec<_> = if section_data.len() >= PARALLEL_THRESHOLD {
        section_data.par_iter().filter_map(parse_one).collect()
    } else {
        section_data.iter().filter_map(parse_one).collect()
    };

    #[cfg(target_arch = "wasm32")]
    let mut sections: Vec<_> = section_data.iter().filter_map(parse_one).collect();

    sections.sort_by_key(|s| s.index);
    document.sections = sections;
    Ok(())
}
/// Copies every readable binary resource of the container into
/// `document.resources`, keyed by file name.
///
/// The file name is the last `/`-separated component of the resource path.
/// Entries that cannot be read are skipped. Every stored entry is tagged
/// `ResourceType::Image`; its MIME type comes from `guess_mime_type`.
///
/// # Errors
/// Propagates the error from `list_bindata`.
fn extract_resources(&mut self, document: &mut Document) -> Result<()> {
    for resource_path in self.container.list_bindata()? {
        let data = match self.container.read_binary(&resource_path) {
            Ok(bytes) => bytes,
            Err(_) => continue,
        };

        let filename = resource_path
            .rsplit('/')
            .next()
            .unwrap_or(&resource_path)
            .to_string();

        let resource = crate::model::Resource {
            resource_type: crate::model::ResourceType::Image,
            filename: Some(filename.clone()),
            mime_type: guess_mime_type(&filename),
            // Length is taken before `data` is moved into the struct.
            size: data.len(),
            data,
        };
        document.resources.insert(filename, resource);
    }
    Ok(())
}
}
/// Metadata values collected by `parse_metadata_xml`.
///
/// Each `Option` field is `None` until non-empty text is found for the
/// element (or `meta` name) it is named after.
#[derive(Default)]
struct MetadataResult {
// Text of `title`.
title: Option<String>,
// Text of `creator`.
creator: Option<String>,
// Text of `description`.
description: Option<String>,
// Text of `date`.
date: Option<String>,
// Text of `modified`.
modified: Option<String>,
// Text of `generator`.
generator: Option<String>,
// De-duplicated entries split out of `subject` / `keywords` text.
keywords: Vec<String>,
}
fn parse_metadata_xml(xml: &str) -> MetadataResult {
let mut result = MetadataResult::default();
let mut reader = Reader::from_str(xml);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let mut current_element: Option<String> = None;
let mut current_meta_name: Option<String> = None;
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(e)) | Ok(Event::Empty(e)) => {
let name = get_local_name(&e);
match name.as_str() {
"title" | "creator" | "description" | "date" | "modified" | "generator"
| "subject" | "keywords" => {
current_element = Some(name);
}
"meta" => {
for attr in e.attributes().flatten() {
if attr.key.local_name().as_ref() == b"name" {
if let Ok(value) = attr.unescape_value() {
current_meta_name = Some(value.to_string());
}
}
}
}
_ => {
current_element = None;
}
}
}
Ok(Event::Text(e)) => {
if let Ok(text) = e.unescape() {
let text = text.trim().to_string();
if !text.is_empty() {
if let Some(ref elem) = current_element {
match elem.as_str() {
"title" => result.title = Some(text.clone()),
"creator" => result.creator = Some(text.clone()),
"description" => result.description = Some(text.clone()),
"date" => result.date = Some(text.clone()),
"modified" => result.modified = Some(text.clone()),
"generator" => result.generator = Some(text.clone()),
"subject" | "keywords" => {
for kw in text.split([',', ';', '|']) {
let kw = kw.trim();
if !kw.is_empty()
&& !result.keywords.contains(&kw.to_string())
{
result.keywords.push(kw.to_string());
}
}
}
_ => {}
}
}
else if let Some(ref meta_name) = current_meta_name {
match meta_name.as_str() {
"title" => result.title = Some(text.clone()),
"creator" => result.creator = Some(text.clone()),
"description" => result.description = Some(text.clone()),
"date" => result.date = Some(text.clone()),
"modified" => result.modified = Some(text.clone()),
"generator" => result.generator = Some(text.clone()),
"subject" | "keywords" => {
for kw in text.split([',', ';', '|']) {
let kw = kw.trim();
if !kw.is_empty()
&& !result.keywords.contains(&kw.to_string())
{
result.keywords.push(kw.to_string());
}
}
}
_ => {}
}
}
}
}
}
Ok(Event::End(_)) => {
current_element = None;
current_meta_name = None;
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
result
}
fn extract_metadata_field(xml: &str, field: &str) -> Option<String> {
let metadata = parse_metadata_xml(xml);
match field {
"title" => metadata.title,
"creator" => metadata.creator,
"description" => metadata.description,
"date" => metadata.date,
"modified" => metadata.modified,
"generator" => metadata.generator,
"subject" | "keywords" => {
if metadata.keywords.is_empty() {
None
} else {
Some(metadata.keywords.join(", "))
}
}
_ => None,
}
}
/// Parses `xml` with `parse_metadata_xml` and returns only the keyword list
/// it collected (empty when none were found).
fn extract_keywords(xml: &str) -> Vec<String> {
parse_metadata_xml(xml).keywords
}
/// Returns the local part of the element's name as an owned string; bytes
/// that are not valid UTF-8 are replaced lossily.
fn get_local_name(e: &quick_xml::events::BytesStart) -> String {
    let local = e.name().local_name();
    String::from_utf8_lossy(local.as_ref()).into_owned()
}
/// Guesses an image MIME type from the extension of `filename`.
///
/// The extension is the text after the last `.` and is compared
/// case-insensitively. Returns `None` when the name contains no `.` at all or
/// when the extension is not one of the known image types.
fn guess_mime_type(filename: &str) -> Option<String> {
    // `rsplit_once` yields `None` for a name without a dot, so a bare name
    // such as "png" is not mistaken for a file with the extension "png".
    let (_, ext) = filename.rsplit_once('.')?;
    let mime = match ext.to_ascii_lowercase().as_str() {
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "bmp" => "image/bmp",
        "webp" => "image/webp",
        "svg" => "image/svg+xml",
        "wmf" => "image/x-wmf",
        "emf" => "image/x-emf",
        _ => return None,
    };
    Some(mime.to_string())
}