jw-hwp-core 0.1.0

Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents
Documentation
//! Minimal Microsoft OLE PropertySet reader.
//!
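//! Decodes VT_I4, VT_LPSTR (read as UTF-8 with lossy replacement of invalid
//! bytes; CP949 is not decoded here), VT_LPWSTR (UTF-16LE) and VT_FILETIME.
//! Any other value type is reported as skipped rather than decoded.
//! Returns one `PropertySection` per section, each carrying the section FMTID
//! and its list of (property ID, value) pairs.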

use crate::error::Error;

/// Type tag `0x0002`. Declared for callers; `read_value` does not decode it, so
/// values of this type come back as `PropValue::Skipped`.
pub const VT_I2: u32 = 0x0002;
/// Type tag `0x0003`. Decoded from four little-endian bytes into `PropValue::I4`.
pub const VT_I4: u32 = 0x0003;
/// Type tag `0x000B`. Declared for callers; `read_value` does not decode it, so
/// values of this type come back as `PropValue::Skipped`.
pub const VT_BOOL: u32 = 0x000B;
/// Type tag `0x001E`. Payload is a `u32` byte length followed by that many bytes;
/// decoded (up to the first NUL byte) into `PropValue::String`.
pub const VT_LPSTR: u32 = 0x001E;
/// Type tag `0x001F`. Payload is a `u32` count of 16-bit units followed by that
/// many little-endian units; decoded (up to the first NUL unit) into
/// `PropValue::String`.
pub const VT_LPWSTR: u32 = 0x001F;
/// Type tag `0x0040`. Decoded from eight little-endian bytes into
/// `PropValue::FileTime`.
pub const VT_FILETIME: u32 = 0x0040;

/// A single decoded property value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PropValue {
    /// 32-bit signed integer read from a `VT_I4` value.
    I4(i32),
    /// Text read from a `VT_LPSTR` or `VT_LPWSTR` value, cut at the first NUL.
    String(String),
    /// 100-nanosecond ticks since 1601-01-01 UTC (Windows FILETIME).
    FileTime(u64),
    /// Any value whose type tag this module does not decode.
    Skipped {
        /// The full 32-bit type field exactly as read from the stream (the
        /// decoder only inspects its low 16 bits, but stores it unmasked).
        vt: u32,
    },
}

/// One section of a PropertySet stream.
#[derive(Debug, Default)]
pub struct PropertySection {
    /// The 16-byte format identifier copied verbatim from the section list entry.
    pub fmtid: [u8; 16],
    /// `(property ID, value)` pairs in the order they appear in the section's
    /// ID/offset table. Duplicate IDs are not merged.
    pub properties: Vec<(u32, PropValue)>,
}

pub fn parse(bytes: &[u8]) -> Result<Vec<PropertySection>, Error> {
    if bytes.len() < 28 {
        return Err(Error::Record(format!(
            "PropertySet too short: {}",
            bytes.len()
        )));
    }
    if u16::from_le_bytes([bytes[0], bytes[1]]) != 0xFFFE {
        return Err(Error::Record("PropertySet: bad byte-order marker".into()));
    }
    let num_sections = u32::from_le_bytes(bytes[24..28].try_into().unwrap()) as usize;
    let mut sections = Vec::with_capacity(num_sections);
    let mut cursor = 28;
    for _ in 0..num_sections {
        if cursor + 20 > bytes.len() {
            return Err(Error::Record(
                "PropertySet: section header truncated".into(),
            ));
        }
        let mut fmtid = [0u8; 16];
        fmtid.copy_from_slice(&bytes[cursor..cursor + 16]);
        let section_offset =
            u32::from_le_bytes(bytes[cursor + 16..cursor + 20].try_into().unwrap()) as usize;
        cursor += 20;
        sections.push(parse_section(fmtid, bytes, section_offset)?);
    }
    Ok(sections)
}

fn parse_section(fmtid: [u8; 16], all: &[u8], start: usize) -> Result<PropertySection, Error> {
    if start + 8 > all.len() {
        return Err(Error::Record("PropertySet: section body truncated".into()));
    }
    let property_count = u32::from_le_bytes(all[start + 4..start + 8].try_into().unwrap()) as usize;
    let mut entries = Vec::with_capacity(property_count);
    let id_off_start = start + 8;
    for i in 0..property_count {
        let off = id_off_start + i * 8;
        if off + 8 > all.len() {
            return Err(Error::Record("PropertySet: id/off table truncated".into()));
        }
        let pid = u32::from_le_bytes(all[off..off + 4].try_into().unwrap());
        let value_off = u32::from_le_bytes(all[off + 4..off + 8].try_into().unwrap()) as usize;
        let abs = start + value_off;
        let value = read_value(all, abs)?;
        entries.push((pid, value));
    }
    Ok(PropertySection {
        fmtid,
        properties: entries,
    })
}

fn read_value(all: &[u8], at: usize) -> Result<PropValue, Error> {
    if at + 4 > all.len() {
        return Err(Error::Record("PropertySet: value type OOB".into()));
    }
    let vt = u32::from_le_bytes(all[at..at + 4].try_into().unwrap());
    let data = at + 4;
    match vt & 0xFFFF {
        VT_I4 => {
            if data + 4 > all.len() {
                return Err(Error::Record("VT_I4 truncated".into()));
            }
            Ok(PropValue::I4(i32::from_le_bytes(
                all[data..data + 4].try_into().unwrap(),
            )))
        }
        VT_FILETIME => {
            if data + 8 > all.len() {
                return Err(Error::Record("VT_FILETIME truncated".into()));
            }
            Ok(PropValue::FileTime(u64::from_le_bytes(
                all[data..data + 8].try_into().unwrap(),
            )))
        }
        VT_LPSTR => {
            if data + 4 > all.len() {
                return Err(Error::Record("VT_LPSTR length OOB".into()));
            }
            let len = u32::from_le_bytes(all[data..data + 4].try_into().unwrap()) as usize;
            let s_start = data + 4;
            let s_end = s_start
                .checked_add(len)
                .ok_or_else(|| Error::Record("VT_LPSTR overflow".into()))?;
            if s_end > all.len() {
                return Err(Error::Record("VT_LPSTR body OOB".into()));
            }
            let raw = &all[s_start..s_end];
            let trimmed = raw.split(|&b| b == 0).next().unwrap_or(raw);
            let s = match std::str::from_utf8(trimmed) {
                Ok(v) => v.to_string(),
                Err(_) => String::from_utf8_lossy(trimmed).into_owned(),
            };
            Ok(PropValue::String(s))
        }
        VT_LPWSTR => {
            if data + 4 > all.len() {
                return Err(Error::Record("VT_LPWSTR length OOB".into()));
            }
            let len_units = u32::from_le_bytes(all[data..data + 4].try_into().unwrap()) as usize;
            let s_start = data + 4;
            let s_end = s_start
                .checked_add(len_units.saturating_mul(2))
                .ok_or_else(|| Error::Record("VT_LPWSTR overflow".into()))?;
            if s_end > all.len() {
                return Err(Error::Record("VT_LPWSTR body OOB".into()));
            }
            let units: Vec<u16> = all[s_start..s_end]
                .chunks_exact(2)
                .map(|c| u16::from_le_bytes([c[0], c[1]]))
                .take_while(|u| *u != 0)
                .collect();
            Ok(PropValue::String(String::from_utf16_lossy(&units)))
        }
        _ => Ok(PropValue::Skipped { vt }),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Appends `value` to `buf` as four little-endian bytes.
    fn put_u32(buf: &mut Vec<u8>, value: u32) {
        buf.extend_from_slice(&value.to_le_bytes());
    }

    /// Appends `value` to `buf` as two little-endian bytes.
    fn put_u16(buf: &mut Vec<u8>, value: u16) {
        buf.extend_from_slice(&value.to_le_bytes());
    }

    /// Builds a one-section stream: property `0x01` is a `VT_I4` holding 42 and
    /// property `0x02` is a `VT_LPWSTR` holding "Hi" plus a NUL unit that is
    /// counted in its length.
    fn build_sample() -> Vec<u8> {
        // Encode the two values first so their offsets can be computed up front.
        let mut int_value = Vec::new();
        put_u32(&mut int_value, VT_I4);
        int_value.extend_from_slice(&42i32.to_le_bytes());

        let mut wide_value = Vec::new();
        put_u32(&mut wide_value, VT_LPWSTR);
        put_u32(&mut wide_value, 3);
        for unit in [0x48u16, 0x69, 0x00] {
            put_u16(&mut wide_value, unit);
        }

        // Section prefix: size + count (8 bytes) and two 8-byte table entries.
        let int_off = (8 + 2 * 8) as u32;
        let wide_off = int_off + int_value.len() as u32;
        let section_len = wide_off + wide_value.len() as u32;

        let mut section = Vec::new();
        put_u32(&mut section, section_len);
        put_u32(&mut section, 2); // property_count
        put_u32(&mut section, 0x01);
        put_u32(&mut section, int_off);
        put_u32(&mut section, 0x02);
        put_u32(&mut section, wide_off);
        section.extend_from_slice(&int_value);
        section.extend_from_slice(&wide_value);

        // Stream header (28 bytes) followed by the single section list entry.
        let mut stream = Vec::new();
        put_u16(&mut stream, 0xFFFE);
        put_u16(&mut stream, 0x0000);
        put_u32(&mut stream, 0);
        stream.extend_from_slice(&[0u8; 16]);
        put_u32(&mut stream, 1);
        stream.extend_from_slice(&[0xAAu8; 16]);
        // The body starts right after the 4-byte offset field written next.
        let body_at = stream.len() as u32 + 4;
        put_u32(&mut stream, body_at);
        stream.extend_from_slice(&section);
        stream
    }

    #[test]
    fn parses_i4_and_lpwstr() {
        let sections = parse(&build_sample()).unwrap();
        assert_eq!(sections.len(), 1);

        let section = &sections[0];
        assert_eq!(section.fmtid, [0xAA; 16]);

        let value_of = |pid: u32| {
            section
                .properties
                .iter()
                .find(|(id, _)| *id == pid)
                .map(|(_, value)| value)
        };
        assert_eq!(value_of(0x01), Some(&PropValue::I4(42)));
        assert_eq!(value_of(0x02), Some(&PropValue::String("Hi".into())));
    }

    #[test]
    fn rejects_bad_bom() {
        // Long enough to pass the length check, but bytes 0..2 are not 0xFFFE.
        let mut stream = [0u8; 32];
        stream[0] = 0x11;
        assert!(parse(&stream).is_err());
    }
}