jw-hwp-core 0.1.1

Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents
Documentation
/// Control codes that occupy eight 16-bit units in a paragraph text stream.
///
/// [`decode_para_text`] skips the whole eight-unit record whenever it meets
/// one of these codes, so none of them contributes to the decoded text.
/// NOTE(review): per the HWP 5.0 format description this set appears to cover
/// both the "inline" and the "extended" control characters — confirm against
/// the specification before changing it.
const EXTENDED_CONTROLS: &[u16] = &[
    1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
];

/// Decodes a paragraph-text payload into a plain `String`.
///
/// `payload` is read as a sequence of little-endian 16-bit code units; a
/// trailing odd byte, if any, is ignored.
///
/// Handling per code unit:
/// * codes listed in [`EXTENDED_CONTROLS`] skip eight units (or whatever is
///   left when the payload is truncated) and emit nothing;
/// * `0` and `24..=31` are dropped;
/// * `10` and `13` become `'\n'`;
/// * a UTF-16 surrogate pair is combined into the single character it
///   encodes, so characters outside the Basic Multilingual Plane survive;
/// * an unpaired surrogate is dropped;
/// * every other unit is emitted as the character with that code point.
///
/// The function never panics and never fails; undecodable units are skipped.
pub fn decode_para_text(payload: &[u8]) -> String {
    let units: Vec<u16> = payload
        .chunks_exact(2)
        .map(|c| u16::from_le_bytes([c[0], c[1]]))
        .collect();

    // At most one char per unit, so this avoids repeated reallocations.
    let mut out = String::with_capacity(units.len());
    let mut i = 0;
    while i < units.len() {
        let u = units[i];
        if EXTENDED_CONTROLS.contains(&u) {
            // Clamp so a truncated control record at the end cannot overshoot.
            i += 8.min(units.len() - i);
            continue;
        }
        match u {
            0 => {
                i += 1;
            }
            10 | 13 => {
                out.push('\n');
                i += 1;
            }
            24..=31 => {
                i += 1;
            }
            0xD800..=0xDBFF => {
                // High surrogate: only meaningful together with the low
                // surrogate that must follow it directly.
                let low = units
                    .get(i + 1)
                    .copied()
                    .filter(|lo| (0xDC00..=0xDFFF).contains(lo));
                match low {
                    Some(lo) => {
                        let code_point = 0x1_0000
                            + ((u32::from(u) - 0xD800) << 10)
                            + (u32::from(lo) - 0xDC00);
                        if let Some(c) = char::from_u32(code_point) {
                            out.push(c);
                        }
                        i += 2;
                    }
                    // Unpaired high surrogate: drop it and re-examine the
                    // next unit on its own (it may be a control code).
                    None => {
                        i += 1;
                    }
                }
            }
            _ => {
                // Unpaired low surrogates land here; `from_u32` rejects them,
                // so they are dropped like any other undecodable unit.
                if let Some(c) = char::from_u32(u32::from(u)) {
                    out.push(c);
                }
                i += 1;
            }
        }
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Serialises 16-bit code units as little-endian bytes, the layout
    /// `decode_para_text` expects.
    fn le(units: &[u16]) -> Vec<u8> {
        units.iter().flat_map(|u| u.to_le_bytes()).collect()
    }

    #[test]
    fn decodes_plain_ascii() {
        let bytes = le(&[0x0048, 0x0069]); // "Hi"
        assert_eq!(decode_para_text(&bytes), "Hi");
    }

    #[test]
    fn decodes_hangul() {
        // "가" = U+AC00
        let bytes = le(&[0xAC00]);
        assert_eq!(decode_para_text(&bytes), "가");
    }

    #[test]
    fn linebreak_becomes_newline() {
        let bytes = le(&[0x0041, 10, 0x0042]);
        assert_eq!(decode_para_text(&bytes), "A\nB");
    }

    #[test]
    fn extended_control_skips_eight_units() {
        // Code 2 opens an eight-unit control record; only the trailing "X"
        // is ordinary text.
        let bytes = le(&[2, 0, 0, 0, 0, 0, 0, 2, 0x0058]);
        assert_eq!(decode_para_text(&bytes), "X");
    }

    #[test]
    fn null_is_stripped() {
        let bytes = le(&[0x0041, 0, 0x0042]);
        assert_eq!(decode_para_text(&bytes), "AB");
    }
}