//! jw-hwp-core 0.1.0
//!
//! Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents.
//!
//! Documentation
use crate::container::Container;
use crate::error::Error;
use crate::record::{
    RecordIter, HWPTAG_BIN_DATA, HWPTAG_CHAR_SHAPE, HWPTAG_DISTRIBUTE_DOC_DATA,
    HWPTAG_DOCUMENT_PROPERTIES, HWPTAG_FACE_NAME, HWPTAG_PARA_SHAPE, HWPTAG_STYLE,
};
use crate::shape::{parse_char_shape, parse_para_shape, ShapeTables};

pub fn read_shape_tables(container: &mut Container) -> Result<ShapeTables, Error> {
    let bytes = container.read_raw_stream("/DocInfo")?;
    let mut tables = ShapeTables::default();
    let (mut next_char_id, mut next_para_id) = (0u32, 0u32);
    for rec in RecordIter::new(&bytes) {
        let rec = rec?;
        match rec.header.tag_id {
            HWPTAG_CHAR_SHAPE => {
                if let Ok(cs) = parse_char_shape(rec.payload) {
                    tables.char_shapes.insert(next_char_id, cs);
                }
                next_char_id += 1;
            }
            HWPTAG_PARA_SHAPE => {
                if let Ok(ps) = parse_para_shape(rec.payload) {
                    tables.para_shapes.insert(next_para_id, ps);
                }
                next_para_id += 1;
            }
            HWPTAG_FACE_NAME => {
                if let Ok(f) = crate::faces::parse_face_name(rec.payload) {
                    tables.faces.push(f);
                }
            }
            HWPTAG_STYLE => {
                if let Ok(s) = crate::styles::parse_style(rec.payload) {
                    tables.styles.push(s);
                }
            }
            _ => {}
        }
    }
    // Populate CharShape.face_names by resolving face_ids against the face table.
    for cs in tables.char_shapes.values_mut() {
        for i in 0..7 {
            if let Some(f) = tables.faces.get(cs.face_ids[i] as usize) {
                cs.face_names[i] = f.name.clone();
            }
        }
    }
    Ok(tables)
}

pub fn read_asset_catalog(container: &mut Container) -> Result<crate::assets::AssetCatalog, Error> {
    let bytes = container.read_raw_stream("/DocInfo")?;
    let mut entries = Vec::new();
    let mut positional = 1u16;
    for rec in RecordIter::new(&bytes) {
        let rec = rec?;
        if rec.header.tag_id == HWPTAG_BIN_DATA {
            if let Ok(mut e) = crate::assets::parse_bin_data(rec.payload, positional) {
                if e.kind == "EMBEDDING" {
                    let stream_name = format!("/BinData/BIN{:04X}.{}", e.id, e.format);
                    e.size_bytes = container.stream_size(&stream_name);
                }
                entries.push(e);
            }
            positional += 1;
        }
    }
    Ok(crate::assets::AssetCatalog { entries })
}

/// Fixed-layout fields decoded from the `HWPTAG_DOCUMENT_PROPERTIES` record of `/DocInfo`.
///
/// Every field is a little-endian integer that `parse` reads from the first 22 bytes of
/// the record payload; the offsets are listed per field. The meaning of each value is
/// suggested only by its name — TODO confirm against the HWP 5.0 format specification.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize)]
pub struct DocumentProperties {
    /// `u16` at payload offset 0.
    pub section_count: u16,
    /// `u16` at payload offset 2.
    pub page_start: u16,
    /// `u16` at payload offset 4.
    pub footnote_start: u16,
    /// `u16` at payload offset 6.
    pub endnote_start: u16,
    /// `u16` at payload offset 8.
    pub picture_start: u16,
    /// `u16` at payload offset 10.
    pub table_start: u16,
    /// `u16` at payload offset 12.
    pub equation_start: u16,
    /// `u32` at payload offset 14.
    pub caret_section: u32,
    /// `u32` at payload offset 18.
    pub caret_position: u32,
}

pub fn read(container: &mut Container) -> Result<DocumentProperties, Error> {
    let bytes = container.read_raw_stream("/DocInfo")?;
    for rec in RecordIter::new(&bytes) {
        let rec = rec?;
        if rec.header.tag_id == HWPTAG_DOCUMENT_PROPERTIES {
            return parse(rec.payload);
        }
    }
    Err(Error::Record(
        "DocInfo missing HWPTAG_DOCUMENT_PROPERTIES".into(),
    ))
}

/// Decodes a `HWPTAG_DOCUMENT_PROPERTIES` payload into [`DocumentProperties`].
///
/// The first 22 bytes are read as seven little-endian `u16` values followed by two
/// little-endian `u32` values; any bytes after that are ignored.
///
/// # Errors
///
/// Returns `Error::Record` when `p` holds fewer than 22 bytes.
fn parse(p: &[u8]) -> Result<DocumentProperties, Error> {
    // 7 * u16 + 2 * u32
    const MIN_LEN: usize = 22;

    let len = p.len();
    if len < MIN_LEN {
        return Err(Error::Record(format!(
            "DocumentProperties too short: {}",
            len
        )));
    }

    // The length check above keeps every index below in bounds.
    let read_u16 = |at: usize| u16::from_le_bytes([p[at], p[at + 1]]);
    let read_u32 = |at: usize| u32::from_le_bytes([p[at], p[at + 1], p[at + 2], p[at + 3]]);

    let section_count = read_u16(0);
    let page_start = read_u16(2);
    let footnote_start = read_u16(4);
    let endnote_start = read_u16(6);
    let picture_start = read_u16(8);
    let table_start = read_u16(10);
    let equation_start = read_u16(12);
    let caret_section = read_u32(14);
    let caret_position = read_u32(18);

    Ok(DocumentProperties {
        section_count,
        page_start,
        footnote_start,
        endnote_start,
        picture_start,
        table_start,
        equation_start,
        caret_section,
        caret_position,
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a 30-byte payload in which every field carries a distinct value, so a
    /// decoder that mixes up two offsets cannot pass by accident. The last 8 bytes are
    /// padding beyond the 22 bytes that `parse` reads.
    fn sample_payload() -> Vec<u8> {
        let mut p = Vec::new();
        for v in &[3u16, 11, 12, 13, 14, 15, 16] {
            p.extend_from_slice(&v.to_le_bytes());
        }
        p.extend_from_slice(&17u32.to_le_bytes());
        p.extend_from_slice(&42u32.to_le_bytes());
        p.extend_from_slice(&[0u8; 8]);
        p
    }

    #[test]
    fn parses_fixed_layout() {
        let d = parse(&sample_payload()).unwrap();
        assert_eq!(
            d,
            DocumentProperties {
                section_count: 3,
                page_start: 11,
                footnote_start: 12,
                endnote_start: 13,
                picture_start: 14,
                table_start: 15,
                equation_start: 16,
                caret_section: 17,
                caret_position: 42,
            }
        );
    }

    #[test]
    fn rejects_short_payload() {
        let p = vec![0u8; 10];
        assert!(parse(&p).is_err());
        // One byte below the minimum must still be rejected.
        assert!(parse(&[0u8; 21]).is_err());
    }

    #[test]
    fn accepts_exact_minimum_length() {
        // Exactly 22 zero bytes decode to an all-zero value.
        let d = parse(&[0u8; 22]).unwrap();
        assert_eq!(d, DocumentProperties::default());
    }
}

/// Looks for the `HWPTAG_DISTRIBUTE_DOC_DATA` record in a decompressed DocInfo stream.
///
/// Returns an owned copy of the payload of the first matching record. The payload is
/// described as 256 bytes long, but its length is not checked here. Returns `None` when
/// no record carries that tag, and also when the record iterator reports an error before
/// a match is found.
pub fn find_distribute_doc_data(doc_info_bytes: &[u8]) -> Option<Vec<u8>> {
    for entry in RecordIter::new(doc_info_bytes) {
        match entry {
            // A malformed record ends the search without a result.
            Err(_) => return None,
            Ok(record) if record.header.tag_id == HWPTAG_DISTRIBUTE_DOC_DATA => {
                return Some(record.payload.to_vec());
            }
            Ok(_) => {}
        }
    }
    None
}

#[cfg(test)]
mod distribute_tests {
    use super::*;
    use crate::record::{HWPTAG_DISTRIBUTE_DOC_DATA, HWPTAG_DOCUMENT_PROPERTIES};

    /// Encodes a record header as one little-endian `u32`: tag in bits 0-9, level in
    /// bits 10-19, size in bits 20-31. A size of `0xFFF` or more is stored as `0xFFF` in
    /// the word, with the real size appended as a second little-endian `u32`.
    fn encode_header(tag: u16, level: u16, size: u32) -> Vec<u8> {
        const FIELD_MASK: u32 = 0x3FF;
        const EXTENDED_SIZE: u32 = 0xFFF;

        let tag_bits = u32::from(tag) & FIELD_MASK;
        let level_bits = (u32::from(level) & FIELD_MASK) << 10;
        let size_bits = size.min(EXTENDED_SIZE) << 20;

        let mut header = (tag_bits | level_bits | size_bits).to_le_bytes().to_vec();
        if size >= EXTENDED_SIZE {
            header.extend_from_slice(&size.to_le_bytes());
        }
        header
    }

    #[test]
    fn finds_record_when_present() {
        let expected = vec![0xAAu8; 256];
        let mut stream = encode_header(HWPTAG_DISTRIBUTE_DOC_DATA, 0, 256);
        stream.extend_from_slice(&expected);

        let found = find_distribute_doc_data(&stream).unwrap();
        assert_eq!(found, expected);
    }

    #[test]
    fn returns_none_when_absent() {
        let mut stream = encode_header(HWPTAG_DOCUMENT_PROPERTIES, 0, 30);
        stream.extend_from_slice(&[0u8; 30]);

        assert!(find_distribute_doc_data(&stream).is_none());
    }
}

#[cfg(test)]
mod asset_tests {
    use crate::assets::parse_bin_data;

    /// Encodes `s` as a little-endian `u16` count of UTF-16 code units followed by the
    /// code units themselves, each little-endian.
    fn encode_wstr(s: &str) -> Vec<u8> {
        let unit_count = s.encode_utf16().count();
        let mut bytes = Vec::with_capacity(2 + 2 * unit_count);
        bytes.extend_from_slice(&(unit_count as u16).to_le_bytes());
        for unit in s.encode_utf16() {
            bytes.extend_from_slice(&unit.to_le_bytes());
        }
        bytes
    }

    #[test]
    fn parses_embedding_bin_data_record() {
        // Property word 1 is expected to decode as the EMBEDDING kind.
        let mut payload = 1u16.to_le_bytes().to_vec();
        payload.extend_from_slice(&encode_wstr("BIN0001.png"));
        payload.extend_from_slice(&encode_wstr("png"));

        let entry = parse_bin_data(&payload, 1).unwrap();

        assert_eq!(entry.id, 1);
        assert_eq!(entry.kind, "EMBEDDING");
        assert_eq!(entry.format, "png");
        assert!(entry.size_bytes.is_none());
    }
}