jw-hwp-core 0.1.0

Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents
Documentation
//! HWPTAG_BIN_DATA parser + asset catalog.

use crate::error::Error;

/// One decoded `HWPTAG_BIN_DATA` record, as produced by [`parse_bin_data`].
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize)]
pub struct BinDataEntry {
    /// 1-based positional id (`"BIN0001"` → 1). The binary record is implicit-ordered, so
    /// the value is supplied by the caller of [`parse_bin_data`] rather than read from it.
    pub id: u16,
    /// First string of the record, e.g. `"BIN0001.png"` or path fragment for LINK items.
    /// Empty when the string is missing or truncated.
    pub name: String,
    /// `"LINK"` | `"EMBEDDING"` | `"STORAGE"`, decoded from the low four bits of the
    /// properties word; any other bit pattern is reported as `"OTHER"`.
    pub kind: String,
    /// Lowercase format name. [`parse_bin_data`] uses the record's format string when it is
    /// non-empty and otherwise falls back to the extension of `name`; empty if neither is
    /// available.
    pub format: String,
    /// Byte size of the underlying stream, if the binary could resolve it.
    /// [`parse_bin_data`] always leaves this `None`; presumably it is filled in later by
    /// whoever resolves the stream — TODO confirm against callers.
    pub size_bytes: Option<u64>,
}

/// Collection of the [`BinDataEntry`] records gathered for a document.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize)]
pub struct AssetCatalog {
    /// The catalogued entries. NOTE(review): presumably kept in record order so that
    /// position matches `BinDataEntry::id` — confirm against the code that fills this in.
    pub entries: Vec<BinDataEntry>,
}

pub fn parse_bin_data(p: &[u8], positional_id: u16) -> Result<BinDataEntry, Error> {
    if p.len() < 2 {
        return Err(Error::Record("BinData: truncated".into()));
    }
    let props = u16::from_le_bytes(p[0..2].try_into().unwrap());
    let kind_bits = (props & 0x000F) as u8;
    let kind = match kind_bits {
        0 => "LINK",
        1 => "EMBEDDING",
        2 => "STORAGE",
        _ => "OTHER",
    }
    .to_string();
    let mut cur = 2usize;
    // LINK has absolute-path + relative-path wstrs; EMBEDDING/STORAGE have name + format (both wstr).
    let name = read_wstr(p, &mut cur).unwrap_or_default();
    let format = read_wstr(p, &mut cur).unwrap_or_default();
    let format = if !format.is_empty() {
        format.to_ascii_lowercase()
    } else {
        name.rsplit('.')
            .next()
            .map(|s| s.to_ascii_lowercase())
            .filter(|s| s != &name.to_ascii_lowercase())
            .unwrap_or_default()
    };
    Ok(BinDataEntry {
        id: positional_id,
        name,
        kind,
        format,
        size_bytes: None,
    })
}

/// Reads one length-prefixed UTF-16LE string from `p` starting at byte offset `*cur`.
///
/// The encoding is a little-endian `u16` count of UTF-16 code units followed by that many
/// little-endian code units. Invalid UTF-16 (e.g. unpaired surrogates) is replaced with
/// U+FFFD rather than rejected.
///
/// On success `*cur` is advanced past the string. On failure — the length prefix or the
/// string data runs past the end of `p` — `None` is returned and `*cur` is left untouched,
/// so a caller never resumes from the middle of a half-read string.
fn read_wstr(p: &[u8], cur: &mut usize) -> Option<String> {
    let start = *cur;
    let data_start = start.checked_add(2)?;
    let prefix = p.get(start..data_start)?;
    let len = usize::from(u16::from_le_bytes([prefix[0], prefix[1]]));

    let data_end = data_start.checked_add(len.checked_mul(2)?)?;
    let units: Vec<u16> = p
        .get(data_start..data_end)?
        .chunks_exact(2)
        .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
        .collect();

    // Commit the cursor only once the whole string is known to be in bounds.
    *cur = data_end;
    Some(String::from_utf16_lossy(&units))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `s` as a little-endian `u16` count of UTF-16 code units followed by the
    /// code units themselves, each little-endian.
    fn encode_wstr(s: &str) -> Vec<u8> {
        let code_units: Vec<u16> = s.encode_utf16().collect();
        let mut bytes = Vec::new();
        bytes.extend_from_slice(&(code_units.len() as u16).to_le_bytes());
        bytes.extend(code_units.iter().flat_map(|unit| unit.to_le_bytes()));
        bytes
    }

    /// A record with properties `1` followed by a name and a format string decodes into an
    /// EMBEDDING entry carrying the caller-supplied id.
    #[test]
    fn parses_embedding_entry() {
        let record = [
            1u16.to_le_bytes().to_vec(), // props -> EMBEDDING
            encode_wstr("BIN0001.png"),
            encode_wstr("png"),
        ]
        .concat();

        let entry = parse_bin_data(&record, 1).unwrap();

        assert_eq!(entry.id, 1);
        assert_eq!(entry.kind, "EMBEDDING");
        assert_eq!(entry.format, "png");
        assert_eq!(entry.name, "BIN0001.png");
    }

    /// An empty payload cannot hold the properties word and must be rejected.
    #[test]
    fn rejects_truncated() {
        let outcome = parse_bin_data(&[], 1);
        assert!(outcome.is_err());
    }
}