jw-hwp-core 0.1.1

Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents
Documentation
//! Parses `Contents/content.hpf` (OPF-style package manifest) into a
//! `Metadata` struct and a sorted list of section file paths.

use crate::error::Error;
use crate::summary::Metadata;
use quick_xml::events::Event;
use quick_xml::Reader;

/// Result of parsing a `Contents/content.hpf` package manifest.
#[derive(Debug, Default)]
pub struct Manifest {
    /// Document metadata collected from the `title` element and from `meta`
    /// elements whose `name` attribute is recognised by [`parse`].
    pub metadata: Metadata,
    /// Hrefs of manifest items that start with `Contents/section` and end
    /// with `.xml`, sorted by [`parse`] before it returns.
    pub section_files: Vec<String>,
    /// Catalog of manifest items whose href starts with `BinData/` or whose
    /// media type starts with `image/`.
    pub assets: crate::assets::AssetCatalog,
}

pub fn parse(bytes: &[u8]) -> Result<Manifest, Error> {
    let mut reader = Reader::from_reader(bytes);
    reader.config_mut().trim_text(true);
    let mut out = Manifest::default();

    // Tracks nesting for element text capture.
    let mut in_title = false;
    // Currently-open <opf:meta name="X" ...> — hold name until text appears.
    let mut current_meta_name: Option<String> = None;

    loop {
        match reader
            .read_event()
            .map_err(|e| Error::Container(format!("content.hpf xml: {e}")))?
        {
            Event::Start(e) => {
                let name = e.name();
                let local = name.local_name();
                let local_bytes = local.as_ref();
                match local_bytes {
                    b"title" => in_title = true,
                    b"meta" => {
                        current_meta_name = get_attr(&e, b"name");
                    }
                    b"item" => {
                        if let Some(href) = get_attr(&e, b"href") {
                            if href.starts_with("Contents/section") && href.ends_with(".xml") {
                                out.section_files.push(href);
                            }
                        }
                    }
                    _ => {}
                }
            }
            Event::Empty(e) => {
                let name = e.name();
                let local = name.local_name();
                let local_bytes = local.as_ref();
                match local_bytes {
                    b"item" => {
                        if let Some(href) = get_attr(&e, b"href") {
                            if href.starts_with("Contents/section") && href.ends_with(".xml") {
                                out.section_files.push(href.clone());
                            }
                            let media_type = get_attr(&e, b"media-type").unwrap_or_default();
                            if href.starts_with("BinData/") || media_type.starts_with("image/") {
                                let id = parse_bin_id(&href)
                                    .unwrap_or((out.assets.entries.len() + 1) as u16);
                                let format =
                                    href.rsplit('.').next().unwrap_or("").to_ascii_lowercase();
                                out.assets.entries.push(crate::assets::BinDataEntry {
                                    id,
                                    name: href,
                                    kind: "EMBEDDING".into(),
                                    format,
                                    size_bytes: None,
                                });
                            }
                        }
                    }
                    b"meta" => {
                        // self-closing meta with no text — ignore
                        current_meta_name = None;
                    }
                    b"title" => {
                        // empty title
                    }
                    _ => {}
                }
            }
            Event::Text(t) => {
                let s = t
                    .unescape()
                    .map_err(|e| Error::Container(format!("content.hpf text: {e}")))?
                    .into_owned();
                if s.is_empty() {
                    continue;
                }
                if in_title {
                    out.metadata.title = Some(s);
                } else if let Some(meta_name) = &current_meta_name {
                    match meta_name.as_str() {
                        "creator" => out.metadata.author = Some(s),
                        "lastsaveby" => out.metadata.last_author = Some(s),
                        "CreatedDate" => out.metadata.created_at = Some(s),
                        "ModifiedDate" => out.metadata.modified_at = Some(s),
                        "subject" => out.metadata.subject = Some(s),
                        "keyword" | "keywords" => out.metadata.keywords = Some(s),
                        "description" => out.metadata.comments = Some(s),
                        _ => {}
                    }
                }
            }
            Event::End(e) => {
                let name = e.name();
                let local = name.local_name();
                match local.as_ref() {
                    b"title" => in_title = false,
                    b"meta" => current_meta_name = None,
                    _ => {}
                }
            }
            Event::Eof => break,
            _ => {}
        }
    }

    out.section_files.sort();
    Ok(out)
}

/// Looks up an attribute on `e` and returns its unescaped value.
///
/// An attribute matches when either its full (possibly prefixed) name or its
/// local name equals `key`. Attributes that fail to parse are skipped, and a
/// matching attribute whose value cannot be unescaped is passed over in
/// favour of a later match. Returns `None` when nothing usable is found.
fn get_attr(e: &quick_xml::events::BytesStart, key: &[u8]) -> Option<String> {
    e.attributes()
        .flatten()
        .filter(|attr| attr.key.as_ref() == key || attr.key.local_name().as_ref() == key)
        .find_map(|attr| attr.unescape_value().ok().map(|value| value.into_owned()))
}

/// Extracts the first run of ASCII digits from the file stem of an href, e.g.
/// "BinData/image1.png" or "BinData/BIN0001.jpg" → 1.
///
/// Only the file stem (the last path segment without its extension) is
/// searched, so digits in a directory name or in an extension such as `jp2`
/// or `mp4` are not mistaken for an id.
///
/// Returns `None` if the stem contains no digits or if the digit run does not
/// fit in a `u16`.
fn parse_bin_id(href: &str) -> Option<u16> {
    // Last path segment; `rsplit` always yields at least one item.
    let file_name = href.rsplit('/').next().unwrap_or(href);
    // Drop the extension, but keep dot-files such as ".name" intact.
    let stem = match file_name.rsplit_once('.') {
        Some((stem, _ext)) if !stem.is_empty() => stem,
        _ => file_name,
    };
    let start = stem.find(|c: char| c.is_ascii_digit())?;
    let rest = &stem[start..];
    let end = rest
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(rest.len());
    rest[..end].parse::<u16>().ok()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Items under `BinData/` must end up in the asset catalog, in document
    /// order, with id, name, format and kind filled in.
    #[test]
    fn extracts_bindata_items() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<opf:package xmlns:opf="http://www.idpf.org/2007/opf/">
  <opf:manifest>
    <opf:item id="header" href="Contents/header.xml" media-type="application/xml"/>
    <opf:item id="section0" href="Contents/section0.xml" media-type="application/xml"/>
    <opf:item id="img1" href="BinData/image1.png" media-type="image/png"/>
    <opf:item id="img2" href="BinData/image2.jpg" media-type="image/jpeg"/>
  </opf:manifest>
</opf:package>"#;
        let manifest = parse(xml.as_bytes()).expect("parse");
        let entries = &manifest.assets.entries;
        assert_eq!(entries.len(), 2);

        let first = &entries[0];
        assert_eq!(first.id, 1);
        assert_eq!(first.name, "BinData/image1.png");
        assert_eq!(first.format, "png");
        assert_eq!(first.kind, "EMBEDDING");

        let second = &entries[1];
        assert_eq!(second.id, 2);
        assert_eq!(second.format, "jpg");
    }

    /// Table of hrefs and the id each one is expected to yield.
    #[test]
    fn parse_bin_id_variants() {
        let cases: [(&str, Option<u16>); 3] = [
            ("BinData/image1.png", Some(1)),
            ("BinData/BIN0042.jpg", Some(42)),
            ("BinData/logo.png", None),
        ];
        for &(href, expected) in &cases {
            assert_eq!(parse_bin_id(href), expected);
        }
    }
}