jw-hwp-core 0.1.1

Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents
Documentation
//! Parses `Contents/header.xml` into `ShapeTables` (char_shapes, para_shapes).

use crate::error::Error;
use crate::shape::{Align, CharShape, ParaShape, ShapeTables};
use quick_xml::events::{BytesStart, Event};
use quick_xml::Reader;

/// Scratch state accumulated while a `charPr` element is being read.
///
/// Created by `start_char` from the element's own attributes, updated by
/// `handle_child` for each nested element, and finally turned into a
/// `CharShape` by `insert_char`.
struct CharPrState {
    /// Value of the `id` attribute (0 when missing or unparsable); used as the
    /// key in `ShapeTables::char_shapes`.
    id: u32,
    /// Font size in points: the `height` attribute divided by 100.
    size_pt: f32,
    /// Text colour from the `textColor` attribute, converted by
    /// `parse_hex_color` to `0x00BBGGRR`; 0 when missing or unparsable.
    color: u32,
    /// Set when an `italic` child element is present.
    italic: bool,
    /// Set when a `bold` child element is present.
    bold: bool,
    /// Set when an `underline` child has a `type` attribute other than `NONE`.
    underline: bool,
    /// Set when a `strikeout` child has a `shape` attribute other than `NONE`.
    strikethrough: bool,
}

/// Scratch state accumulated while a `paraPr` element is being read.
///
/// Created by `start_para`, updated by `handle_child` for each nested element,
/// and finally turned into a `ParaShape` by `insert_para`.
struct ParaPrState {
    /// Value of the `id` attribute (0 when missing or unparsable); used as the
    /// key in `ShapeTables::para_shapes`.
    id: u32,
    /// Horizontal alignment from the `align` child's `horizontal` attribute;
    /// starts as `Align::Left`.
    align: Align,
    /// `left` attribute of the `margin` child, stored as parsed; starts at 0.
    left_margin: i32,
    /// `right` attribute of the `margin` child, stored as parsed; starts at 0.
    right_margin: i32,
    /// `indent` attribute of the `margin` child, stored as parsed; starts at 0.
    indent: i32,
}

pub fn parse(bytes: &[u8]) -> Result<ShapeTables, Error> {
    let mut reader = Reader::from_reader(bytes);
    reader.config_mut().trim_text(true);
    let mut tables = ShapeTables::default();
    let mut cur_char: Option<CharPrState> = None;
    let mut cur_para: Option<ParaPrState> = None;
    // Track only the first language group's fonts for the flat face list; this
    // matches the typical HANGUL slot-0 convention in HWP binary.
    let mut in_first_fontface = false;
    let mut seen_fontface = false;

    loop {
        match reader
            .read_event()
            .map_err(|e| Error::Container(format!("header.xml: {e}")))?
        {
            Event::Start(e) => {
                let local_owned = e.name().local_name().as_ref().to_vec();
                let local = local_owned.as_slice();
                match local {
                    b"charPr" => cur_char = Some(start_char(&e)),
                    b"paraPr" => cur_para = Some(start_para(&e)),
                    b"fontface" => {
                        if !seen_fontface {
                            in_first_fontface = true;
                            seen_fontface = true;
                        }
                    }
                    b"font" => {
                        if in_first_fontface {
                            if let Some(face) = get_attr(&e, b"face") {
                                if !face.is_empty() {
                                    tables.faces.push(crate::faces::FaceName {
                                        name: face,
                                        substitute: None,
                                        base: None,
                                    });
                                }
                            }
                        }
                    }
                    b"style" => push_style(&mut tables, &e),
                    _ => handle_child(&e, cur_char.as_mut(), cur_para.as_mut()),
                }
            }
            Event::Empty(e) => {
                let local_owned = e.name().local_name().as_ref().to_vec();
                let local = local_owned.as_slice();
                match local {
                    b"charPr" => {
                        let s = start_char(&e);
                        insert_char(&mut tables, s);
                    }
                    b"paraPr" => {
                        let s = start_para(&e);
                        insert_para(&mut tables, s);
                    }
                    b"font" => {
                        if in_first_fontface {
                            if let Some(face) = get_attr(&e, b"face") {
                                if !face.is_empty() {
                                    tables.faces.push(crate::faces::FaceName {
                                        name: face,
                                        substitute: None,
                                        base: None,
                                    });
                                }
                            }
                        }
                    }
                    b"style" => push_style(&mut tables, &e),
                    _ => handle_child(&e, cur_char.as_mut(), cur_para.as_mut()),
                }
            }
            Event::End(e) => {
                let local_owned = e.name().local_name().as_ref().to_vec();
                let local = local_owned.as_slice();
                match local {
                    b"charPr" => {
                        if let Some(s) = cur_char.take() {
                            insert_char(&mut tables, s);
                        }
                    }
                    b"paraPr" => {
                        if let Some(s) = cur_para.take() {
                            insert_para(&mut tables, s);
                        }
                    }
                    b"fontface" => {
                        in_first_fontface = false;
                    }
                    _ => {}
                }
            }
            Event::Eof => break,
            _ => {}
        }
    }
    // Populate CharShape.face_names from the flat face list; face_ids for HWPX
    // are always [0;7], so face_names[slot] resolves to faces[0].name for all
    // slots — acceptable given HWPX doesn't encode per-slot face refs here.
    for cs in tables.char_shapes.values_mut() {
        for i in 0..7 {
            cs.face_names[i] = tables
                .faces
                .get(cs.face_ids[i] as usize)
                .map(|f| f.name.clone())
                .unwrap_or_default();
        }
    }
    Ok(tables)
}

/// Appends one `style` element to `tables.styles`.
///
/// `name` and `engName` default to the empty string when absent;
/// `paraPrIDRef` and `charPrIDRef` default to 0 when absent or unparsable.
fn push_style(tables: &mut ShapeTables, e: &BytesStart) {
    let local_name = get_attr(e, b"name").unwrap_or_default();
    let english_name = get_attr(e, b"engName").unwrap_or_default();
    // NOTE(review): `as u16` keeps only the low 16 bits of a larger reference;
    // confirm that ids above u16::MAX cannot occur in practice.
    let para_ref = get_attr_u32(e, b"paraPrIDRef").unwrap_or(0);
    let char_ref = get_attr_u32(e, b"charPrIDRef").unwrap_or(0);

    let style = crate::styles::Style {
        local_name,
        english_name,
        para_shape_id: para_ref as u16,
        char_shape_id: char_ref as u16,
    };
    tables.styles.push(style);
}

/// Builds the initial [`CharPrState`] from the attributes of a `charPr` tag.
///
/// * `id` – shape id, 0 when missing or unparsable.
/// * `height` – divided by 100 to obtain points; 1000 (10 pt) when missing or
///   unparsable.
/// * `textColor` – converted with `parse_hex_color`; 0 when missing or not a
///   valid colour.
///
/// All style flags start cleared; `handle_child` sets them from child elements.
fn start_char(e: &BytesStart) -> CharPrState {
    let shape_id = get_attr_u32(e, b"id").unwrap_or(0);
    let raw_height = get_attr_u32(e, b"height").unwrap_or(1000);
    let text_color = match get_attr(e, b"textColor") {
        Some(raw) => parse_hex_color(&raw).unwrap_or(0),
        None => 0,
    };

    CharPrState {
        id: shape_id,
        size_pt: raw_height as f32 / 100.0,
        color: text_color,
        italic: false,
        bold: false,
        underline: false,
        strikethrough: false,
    }
}

/// Builds the initial [`ParaPrState`] from the attributes of a `paraPr` tag.
///
/// Only `id` is read here (0 when missing or unparsable). Alignment starts as
/// `Align::Left` and all margins at 0 until `handle_child` overrides them.
fn start_para(e: &BytesStart) -> ParaPrState {
    let shape_id = match get_attr_u32(e, b"id") {
        Some(value) => value,
        None => 0,
    };
    ParaPrState {
        id: shape_id,
        align: Align::Left,
        left_margin: 0,
        right_margin: 0,
        indent: 0,
    }
}

fn insert_char(tables: &mut ShapeTables, s: CharPrState) {
    tables.char_shapes.insert(
        s.id,
        CharShape {
            face_ids: [0; 7],
            face_names: Default::default(),
            size_pt: s.size_pt,
            italic: s.italic,
            bold: s.bold,
            underline: s.underline,
            strikethrough: s.strikethrough,
            superscript: false,
            subscript: false,
            color: s.color,
        },
    );
}

fn insert_para(tables: &mut ShapeTables, s: ParaPrState) {
    tables.para_shapes.insert(
        s.id,
        ParaShape {
            align: s.align,
            left_margin: s.left_margin,
            right_margin: s.right_margin,
            indent: s.indent,
            space_before: 0,
            space_after: 0,
            line_spacing: 0,
        },
    );
}

/// Folds one child element into the `charPr` and/or `paraPr` that is open.
///
/// With a character state:
/// * `bold` / `italic` – the element's presence sets the flag.
/// * `underline` – sets the flag when its `type` attribute is present and not
///   `NONE`.
/// * `strikeout` – sets the flag when its `shape` attribute is present and not
///   `NONE`.
///
/// With a paragraph state:
/// * `align` – maps the `horizontal` attribute to an [`Align`] variant;
///   anything unrecognised (including a missing attribute) is `Align::Unknown`.
/// * `margin` – copies whichever of `left`, `right`, `indent` parse as `i32`.
///
/// Elements with any other local name are ignored. Flags are only ever turned
/// on here, never cleared.
fn handle_child(
    e: &BytesStart,
    cur_char: Option<&mut CharPrState>,
    cur_para: Option<&mut ParaPrState>,
) {
    let qname = e.name();
    let local_name = qname.local_name();
    let tag: &[u8] = local_name.as_ref();

    // A decoration is active only when its attribute exists and is not "NONE".
    let is_active = |attr: Option<String>| attr.map_or(false, |value| value != "NONE");

    if let Some(state) = cur_char {
        match tag {
            b"bold" => state.bold = true,
            b"italic" => state.italic = true,
            b"underline" => state.underline |= is_active(get_attr(e, b"type")),
            b"strikeout" => state.strikethrough |= is_active(get_attr(e, b"shape")),
            _ => {}
        }
    }

    if let Some(state) = cur_para {
        match tag {
            b"align" => {
                let horizontal = get_attr(e, b"horizontal");
                state.align = match horizontal.as_deref() {
                    Some("LEFT") => Align::Left,
                    Some("RIGHT") => Align::Right,
                    Some("CENTER") => Align::Center,
                    Some("JUSTIFY") => Align::Both,
                    Some("DISTRIBUTE") => Align::Distributed,
                    Some("DIVISION") => Align::Division,
                    _ => Align::Unknown,
                };
            }
            b"margin" => {
                // Each value is replaced only when its attribute parses.
                state.left_margin = get_attr_i32(e, b"left").unwrap_or(state.left_margin);
                state.right_margin = get_attr_i32(e, b"right").unwrap_or(state.right_margin);
                state.indent = get_attr_i32(e, b"indent").unwrap_or(state.indent);
            }
            _ => {}
        }
    }
}

/// Returns the unescaped value of the attribute named `key`, if any.
///
/// An attribute matches when either its full name or its local name (the part
/// after a namespace prefix) equals `key`. Attributes that fail to parse are
/// skipped, and a matching attribute whose value cannot be unescaped is passed
/// over in favour of a later match. Returns `None` when nothing usable is found.
fn get_attr(e: &BytesStart, key: &[u8]) -> Option<String> {
    e.attributes()
        .flatten()
        .filter(|attr| attr.key.as_ref() == key || attr.key.local_name().as_ref() == key)
        .find_map(|attr| {
            attr.unescape_value()
                .ok()
                .map(|value| value.into_owned())
        })
}

/// Reads attribute `key` and parses it as `u32`.
///
/// Returns `None` when the attribute is absent or is not a valid `u32`.
fn get_attr_u32(e: &BytesStart, key: &[u8]) -> Option<u32> {
    let raw = get_attr(e, key)?;
    raw.parse().ok()
}

/// Reads attribute `key` and parses it as `i32`.
///
/// Returns `None` when the attribute is absent or is not a valid `i32`.
fn get_attr_i32(e: &BytesStart, key: &[u8]) -> Option<i32> {
    let raw = get_attr(e, key)?;
    raw.parse().ok()
}

/// Parse `#RRGGBB` into HWP COLORREF `0x00BBGGRR`.
///
/// Surrounding whitespace is ignored. Returns `None` for anything that is not
/// a `#` followed by exactly six ASCII hex digits, which includes the sentinel
/// string "none".
///
/// The digits are validated before any slicing or numeric parsing, so
/// multi-byte UTF-8 input cannot cause a char-boundary panic and sign
/// characters (which `from_str_radix` would otherwise accept) are rejected.
fn parse_hex_color(s: &str) -> Option<u32> {
    let hex = s.trim().strip_prefix('#')?;
    let digits = hex.as_bytes();
    if digits.len() != 6 || !digits.iter().all(u8::is_ascii_hexdigit) {
        return None;
    }
    // Six validated hex digits always fit in a u32 as 0x00RRGGBB.
    let rgb = u32::from_str_radix(hex, 16).ok()?;
    let r = (rgb >> 16) & 0xFF;
    let g = (rgb >> 8) & 0xFF;
    let b = rgb & 0xFF;
    // COLORREF stores the channels in the opposite byte order.
    Some((b << 16) | (g << 8) | r)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `#RRGGBB` is reordered into `0x00BBGGRR`; the "none" sentinel is rejected.
    #[test]
    fn color_parse() {
        // #1F4E79 -> r=0x1F,g=0x4E,b=0x79 -> 0x00794E1F
        let cases: [(&str, Option<u32>); 2] = [("#1F4E79", Some(0x00794E1F)), ("none", None)];
        for (input, expected) in cases {
            assert_eq!(parse_hex_color(input), expected);
        }
    }

    /// Only fonts of the first `fontface` group are collected, and every
    /// `style` element is recorded with its names and shape references.
    #[test]
    fn extracts_fontfaces_and_styles() {
        let header = r#"<?xml version="1.0" encoding="UTF-8"?>
<hh:head xmlns:hh="http://www.hancom.co.kr/hwpml/2011/head">
  <hh:refList>
    <hh:fontfaces itemCnt="1">
      <hh:fontface lang="HANGUL" fontCnt="2">
        <hh:font id="0" face="함초롬돋움" type="TTF"/>
        <hh:font id="1" face="함초롬바탕" type="TTF"/>
      </hh:fontface>
      <hh:fontface lang="LATIN" fontCnt="1">
        <hh:font id="0" face="Arial" type="TTF"/>
      </hh:fontface>
    </hh:fontfaces>
    <hh:styles itemCnt="2">
      <hh:style id="0" type="PARA" name="바탕글" engName="Normal" paraPrIDRef="0" charPrIDRef="0"/>
      <hh:style id="1" type="PARA" name="본문" engName="Body" paraPrIDRef="1" charPrIDRef="0"/>
    </hh:styles>
  </hh:refList>
</hh:head>"#;
        let tables = parse(header.as_bytes()).expect("parse");

        // The LATIN group's "Arial" must not appear in the flat face list.
        let face_names: Vec<&str> = tables.faces.iter().map(|f| f.name.as_str()).collect();
        assert_eq!(face_names, ["함초롬돋움", "함초롬바탕"]);

        assert_eq!(tables.styles.len(), 2);
        let normal = &tables.styles[0];
        let body = &tables.styles[1];
        assert_eq!(normal.local_name, "바탕글");
        assert_eq!(normal.english_name, "Normal");
        assert_eq!(body.local_name, "본문");
        assert_eq!(body.char_shape_id, 0);
        assert_eq!(body.para_shape_id, 1);
    }
}