jw-hwp-core 0.1.0

Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents
Documentation
//! Normalized CharShape / ParaShape and their DocInfo record parsers.

use crate::error::Error;
use serde::Serialize;
use std::collections::HashMap;

/// Character formatting decoded from one `HWPTAG_CHAR_SHAPE` DocInfo record.
///
/// [`parse_char_shape`] fills every field except `face_names`, which it leaves empty.
#[derive(Debug, Clone, Default, Serialize, PartialEq)]
pub struct CharShape {
    /// Font face ID per language (HWP has 7 lang slots: Hangul/English/Hanja/Japanese/Other/Symbol/User).
    /// Read as seven little-endian `u16` values from the first 14 bytes of the record.
    pub face_ids: [u16; 7],
    /// Resolved font names per language slot (populated after FaceName parsing).
    /// `parse_char_shape` itself leaves these as empty strings.
    #[serde(default)]
    pub face_names: [String; 7],
    /// Base font size in points (HWP stores as pt*100, we expose as f32 points).
    /// The raw value is the little-endian `i32` at bytes 42..46.
    pub size_pt: f32,
    /// Bit 0 of the property word at bytes 46..50.
    pub italic: bool,
    /// Bit 1 of the property word.
    pub bold: bool,
    /// `true` when the underline-kind field (property bits 2-3) is non-zero.
    /// The specific underline kind is not preserved.
    pub underline: bool,
    /// `true` when the strikethrough-kind field (property bits 18-20) is non-zero.
    pub strikethrough: bool,
    /// Bit 15 of the property word.
    pub superscript: bool,
    /// Bit 16 of the property word.
    pub subscript: bool,
    /// Text color as 0xBBGGRR (HWP COLORREF).
    /// Read from bytes 52..56; `0` when the record is shorter than 56 bytes.
    pub color: u32,
}

/// Paragraph alignment, decoded by [`parse_para_shape`] from bits 2-4 of the
/// first property word of a ParaShape record.
///
/// Serialized with snake_case variant names (e.g. `"both"`, `"left"`).
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum Align {
    /// Raw code 0. NOTE(review): presumably "justify on both edges" — confirm against the HWP spec.
    Both,
    /// Raw code 1.
    Left,
    /// Raw code 2.
    Right,
    /// Raw code 3.
    Center,
    /// Raw code 4.
    Distributed,
    /// Raw code 5. NOTE(review): exact layout semantics not visible here — confirm against the HWP spec.
    Division,
    /// Any raw code not listed above (6 or 7, since the field is three bits wide).
    Unknown,
}

/// Paragraph formatting decoded from one ParaShape DocInfo record by [`parse_para_shape`].
///
/// All numeric fields are the raw little-endian `i32` values from the record, unscaled.
/// NOTE(review): units are presumably HWPUNIT — confirm against the HWP spec.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct ParaShape {
    /// Alignment from bits 2-4 of the property word at bytes 0..4.
    pub align: Align,
    /// Raw value at bytes 4..8.
    pub left_margin: i32,
    /// Raw value at bytes 8..12.
    pub right_margin: i32,
    /// Raw value at bytes 12..16.
    pub indent: i32,
    /// Raw value at bytes 16..20.
    pub space_before: i32,
    /// Raw value at bytes 20..24.
    pub space_after: i32,
    /// Line spacing: legacy INT32 field for pre-5.0.2.5, extended UINT32 at offset 50 for newer docs.
    /// The parser picks by record length: bytes 50..54 (reinterpreted as `i32`) when the record
    /// has at least 54 bytes, otherwise the legacy field at bytes 24..28.
    pub line_spacing: i32,
}

/// Shape, font-face and style tables gathered for a document.
///
/// This type only holds the data; it is populated by code outside this block.
#[derive(Debug, Clone, Default, Serialize, PartialEq)]
pub struct ShapeTables {
    /// char_shape_id → CharShape. IDs are implicit: the Nth HWPTAG_CHAR_SHAPE record in DocInfo has id N.
    pub char_shapes: HashMap<u32, CharShape>,
    /// para_shape_id → ParaShape.
    /// NOTE(review): presumably keyed by implicit record order like `char_shapes` — confirm at the fill site.
    pub para_shapes: HashMap<u32, ParaShape>,
    /// Font face entries.
    /// NOTE(review): presumably in DocInfo record order so `CharShape::face_ids` can index them — confirm.
    pub faces: Vec<crate::faces::FaceName>,
    /// Style entries. NOTE(review): ordering/indexing convention is not visible here — confirm.
    pub styles: Vec<crate::styles::Style>,
}

/// Decodes the payload of a single `HWPTAG_CHAR_SHAPE` record into a [`CharShape`].
///
/// All multi-byte values are little-endian. Offsets read:
/// - `0..14`  — seven `u16` face IDs, one per language slot
/// - `42..46` — base size as `i32` in 1/100 pt
/// - `46..50` — property bit field
/// - `52..56` — text color, only when the payload is at least 56 bytes long
///
/// `face_names` is left at its default; this function does not resolve names.
///
/// # Errors
/// Returns [`Error::Record`] if `p` holds fewer than 50 bytes.
pub fn parse_char_shape(p: &[u8]) -> Result<CharShape, Error> {
    if p.len() < 50 {
        return Err(Error::Record(format!("CharShape too short: {}", p.len())));
    }

    // The length guard above keeps every fixed offset below in bounds, and a
    // 4-byte slice always converts to `[u8; 4]`, so the unwrap cannot fire.
    let quad = |at: usize| -> [u8; 4] { p[at..at + 4].try_into().unwrap() };

    let mut face_ids = [0u16; 7];
    for (id, raw) in face_ids.iter_mut().zip(p[..14].chunks_exact(2)) {
        *id = u16::from_le_bytes([raw[0], raw[1]]);
    }

    let hundredths_pt = i32::from_le_bytes(quad(42));
    let flags = u32::from_le_bytes(quad(46));
    let flag_set = |bit: u32| flags & (1 << bit) != 0;
    let field_set = |shift: u32, mask: u32| (flags >> shift) & mask != 0;

    // Payloads shorter than 56 bytes carry no text color; report 0 for them.
    let color = p
        .get(52..56)
        .map_or(0, |c| u32::from_le_bytes([c[0], c[1], c[2], c[3]]));

    Ok(CharShape {
        face_ids,
        face_names: Default::default(),
        size_pt: hundredths_pt as f32 / 100.0,
        italic: flag_set(0),
        bold: flag_set(1),
        // Underline kind occupies bits 2-3; any non-zero kind counts as underlined.
        underline: field_set(2, 0x3),
        // Strikethrough kind occupies bits 18-20 (spec table 35); non-zero means struck through.
        strikethrough: field_set(18, 0x7),
        superscript: flag_set(15),
        subscript: flag_set(16),
        color,
    })
}

/// Decodes the payload of a single ParaShape DocInfo record into a [`ParaShape`].
///
/// All values are little-endian. Offsets read:
/// - `0..4`   — property word; bits 2-4 select the [`Align`] variant
/// - `4..24`  — left margin, right margin, indent, space before, space after (`i32` each)
/// - `24..28` — legacy line spacing, used when the payload is shorter than 54 bytes
/// - `50..54` — extended line spacing, used when the payload is at least 54 bytes
///
/// # Errors
/// Returns [`Error::Record`] if `p` holds fewer than 42 bytes.
pub fn parse_para_shape(p: &[u8]) -> Result<ParaShape, Error> {
    if p.len() < 42 {
        return Err(Error::Record(format!("ParaShape too short: {}", p.len())));
    }

    // Callers below only pass offsets that the length checks have proven in
    // bounds, and a 4-byte slice always converts to `[u8; 4]`.
    let quad = |at: usize| -> [u8; 4] { p[at..at + 4].try_into().unwrap() };
    let int_at = |at: usize| i32::from_le_bytes(quad(at));

    let align_code = (u32::from_le_bytes(quad(0)) >> 2) & 0x7;
    let align = match align_code {
        0 => Align::Both,
        1 => Align::Left,
        2 => Align::Right,
        3 => Align::Center,
        4 => Align::Distributed,
        5 => Align::Division,
        _ => Align::Unknown,
    };

    // Prefer the extended field when the record is long enough to contain it.
    let spacing_offset = if p.len() >= 54 { 50 } else { 24 };

    Ok(ParaShape {
        align,
        left_margin: int_at(4),
        right_margin: int_at(8),
        indent: int_at(12),
        space_before: int_at(16),
        space_after: int_at(20),
        line_spacing: int_at(spacing_offset),
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a zero-filled 72-byte CharShape payload with the given base size
    /// (1/100 pt), italic/bold flags, and a fixed text color of `0x00112233`.
    fn build_char_shape(base_size: i32, italic: bool, bold: bool) -> Vec<u8> {
        let mut p = vec![0u8; 72];
        p[42..46].copy_from_slice(&base_size.to_le_bytes());
        let mut props: u32 = 0;
        if italic {
            props |= 1;
        }
        if bold {
            props |= 2;
        }
        p[46..50].copy_from_slice(&props.to_le_bytes());
        p[52..56].copy_from_slice(&0x00112233u32.to_le_bytes());
        p
    }

    #[test]
    fn parses_bold_italic_char_shape() {
        let p = build_char_shape(1200, true, true);
        let cs = parse_char_shape(&p).unwrap();
        assert_eq!(cs.size_pt, 12.0);
        assert!(cs.italic);
        assert!(cs.bold);
        assert!(!cs.underline);
        assert_eq!(cs.color, 0x00112233);
    }

    #[test]
    fn parses_para_shape_alignment() {
        let mut p = vec![0u8; 54];
        // align = right = 2, stored in bits 2..4
        let props1: u32 = 2 << 2;
        p[0..4].copy_from_slice(&props1.to_le_bytes());
        p[4..8].copy_from_slice(&100i32.to_le_bytes());
        p[12..16].copy_from_slice(&200i32.to_le_bytes());
        let ps = parse_para_shape(&p).unwrap();
        assert_eq!(ps.align, Align::Right);
        assert_eq!(ps.left_margin, 100);
        assert_eq!(ps.indent, 200);
    }

    #[test]
    fn char_shape_rejects_short_record() {
        // 50 bytes is the minimum accepted length; one byte less must be an error.
        assert!(parse_char_shape(&[0u8; 49]).is_err());
        assert!(parse_char_shape(&[]).is_err());
        assert!(parse_char_shape(&[0u8; 50]).is_ok());
    }

    #[test]
    fn char_shape_color_is_zero_without_color_bytes() {
        // The color lives at bytes 52..56; anything shorter falls back to 0.
        let mut p = build_char_shape(1000, false, false);
        p.truncate(55);
        assert_eq!(parse_char_shape(&p).unwrap().color, 0);
        p.truncate(50);
        assert_eq!(parse_char_shape(&p).unwrap().color, 0);
    }

    #[test]
    fn char_shape_reads_face_ids() {
        let mut p = build_char_shape(1000, false, false);
        p[0..2].copy_from_slice(&0x1234u16.to_le_bytes());
        p[12..14].copy_from_slice(&0xBEEFu16.to_le_bytes());
        let cs = parse_char_shape(&p).unwrap();
        assert_eq!(cs.face_ids, [0x1234, 0, 0, 0, 0, 0, 0xBEEF]);
        assert!(cs.face_names.iter().all(|name| name.is_empty()));
    }

    #[test]
    fn char_shape_decodes_decoration_bits() {
        let mut p = build_char_shape(1000, false, false);

        // underline kind = 1 (bits 2-3), superscript (bit 15), strike kind = 1 (bits 18-20)
        let props: u32 = (1 << 2) | (1 << 15) | (1 << 18);
        p[46..50].copy_from_slice(&props.to_le_bytes());
        let cs = parse_char_shape(&p).unwrap();
        assert!(cs.underline);
        assert!(cs.strikethrough);
        assert!(cs.superscript);
        assert!(!cs.subscript);
        assert!(!cs.italic);
        assert!(!cs.bold);

        // Bits just outside the underline/strike fields must not trigger them.
        let props: u32 = (1 << 4) | (1 << 16) | (1 << 21);
        p[46..50].copy_from_slice(&props.to_le_bytes());
        let cs = parse_char_shape(&p).unwrap();
        assert!(!cs.underline);
        assert!(!cs.strikethrough);
        assert!(!cs.superscript);
        assert!(cs.subscript);
    }

    #[test]
    fn para_shape_rejects_short_record() {
        // 42 bytes is the minimum accepted length.
        assert!(parse_para_shape(&[0u8; 41]).is_err());
        assert!(parse_para_shape(&[]).is_err());
        assert!(parse_para_shape(&[0u8; 42]).is_ok());
    }

    #[test]
    fn para_shape_line_spacing_depends_on_record_length() {
        // Short record: only the legacy field at 24..28 exists.
        let mut p = vec![0u8; 42];
        p[24..28].copy_from_slice(&160i32.to_le_bytes());
        assert_eq!(parse_para_shape(&p).unwrap().line_spacing, 160);

        // Full record: the extended field at 50..54 takes precedence.
        let mut p = vec![0u8; 54];
        p[24..28].copy_from_slice(&160i32.to_le_bytes());
        p[50..54].copy_from_slice(&180i32.to_le_bytes());
        assert_eq!(parse_para_shape(&p).unwrap().line_spacing, 180);

        // One byte short of the extended field: still the legacy value.
        p.truncate(53);
        assert_eq!(parse_para_shape(&p).unwrap().line_spacing, 160);
    }

    #[test]
    fn para_shape_reads_margins_and_spacing() {
        let mut p = vec![0u8; 42];
        p[4..8].copy_from_slice(&(-100i32).to_le_bytes());
        p[8..12].copy_from_slice(&300i32.to_le_bytes());
        p[16..20].copy_from_slice(&10i32.to_le_bytes());
        p[20..24].copy_from_slice(&20i32.to_le_bytes());
        let ps = parse_para_shape(&p).unwrap();
        assert_eq!(ps.left_margin, -100);
        assert_eq!(ps.right_margin, 300);
        assert_eq!(ps.indent, 0);
        assert_eq!(ps.space_before, 10);
        assert_eq!(ps.space_after, 20);
    }

    #[test]
    fn para_shape_maps_every_alignment_code() {
        let cases = [
            (0u32, Align::Both),
            (1, Align::Left),
            (2, Align::Right),
            (3, Align::Center),
            (4, Align::Distributed),
            (5, Align::Division),
            (6, Align::Unknown),
            (7, Align::Unknown),
        ];
        for (code, expected) in cases.iter() {
            let mut p = vec![0u8; 42];
            // Set the neighbouring bits 0-1 and 5 as noise; only bits 2-4 may matter.
            let props1: u32 = (*code << 2) | 0b11 | (1 << 5);
            p[0..4].copy_from_slice(&props1.to_le_bytes());
            assert_eq!(parse_para_shape(&p).unwrap().align, *expected);
        }
    }
}