jw-hwp-core 0.1.0

Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents
Documentation
//! HWP distributed-document key derivation and section decryption.
//!
//! Algorithm reverse-engineered from pyhwp's hwp5/distdoc.py and corroborated
//! against real distributed HWP files. The HWP 5.0 spec does not document
//! this pipeline.

use crate::error::Error;

/// Linear congruential generator compatible with the MSVC runtime's `rand()`.
///
/// Key derivation draws both its XOR key bytes and its run lengths from this
/// generator, so the exact output sequence matters.
pub(crate) struct MsvcRand {
    state: u32,
}

impl MsvcRand {
    /// Multiplier applied to the state on every step.
    const MULTIPLIER: u32 = 214013;
    /// Increment added to the state on every step.
    const INCREMENT: u32 = 2531011;

    /// Creates a generator whose initial state is `seed`.
    pub fn new(seed: u32) -> Self {
        MsvcRand { state: seed }
    }

    /// Steps the generator once (wrapping on overflow) and returns bits
    /// 16..=30 of the new state, i.e. a value in `0..=0x7FFF`.
    pub fn next_15bit(&mut self) -> u32 {
        let advanced = self
            .state
            .wrapping_mul(Self::MULTIPLIER)
            .wrapping_add(Self::INCREMENT);
        self.state = advanced;
        (advanced >> 16) & 0x7FFF
    }

    /// Draws one 15-bit value and keeps only its low eight bits.
    pub fn next_byte(&mut self) -> u8 {
        let [low, ..] = self.next_15bit().to_le_bytes();
        low
    }

    /// Draws one 15-bit value and maps its low nibble to a run length in `1..=16`.
    pub fn next_run_length(&mut self) -> usize {
        let nibble = self.next_15bit() % 16;
        nibble as usize + 1
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Seed 1 must reproduce the first five values of the reference sequence.
    #[test]
    fn msvc_rand_matches_reference_sequence() {
        let mut rng = MsvcRand::new(1);
        let mut drawn = Vec::with_capacity(5);
        for _ in 0..5 {
            drawn.push(rng.next_15bit());
        }
        assert_eq!(drawn, [41, 18467, 6334, 26500, 19169]);
    }

    /// A zero seed is not a fixed point: the increment alone moves the state.
    #[test]
    fn seed_zero_still_advances() {
        let first = MsvcRand::new(0).next_15bit();
        assert_eq!(first, 38);
    }
}

/// Length in bytes of a DISTRIBUTE_DOC_DATA payload: `rle_xor_decode` requires at
/// least this many bytes and `derive_aes_key` requires exactly this many.
pub const DIST_DOC_RECORD_LEN: usize = 256;

/// XOR-unmasks a DISTRIBUTE_DOC_DATA payload in place.
///
/// The first four bytes are read as a little-endian `u32` seed for [`MsvcRand`]
/// and are never modified. The generator then yields (key byte, run length)
/// pairs; each pair covers the next `run length` positions counted from index 0,
/// and every covered position in `4..DIST_DOC_RECORD_LEN` is XORed with that key
/// byte. Bytes at index `DIST_DOC_RECORD_LEN` and beyond are left untouched.
///
/// # Errors
///
/// Returns [`Error::Record`] when `data` is shorter than [`DIST_DOC_RECORD_LEN`].
pub(crate) fn rle_xor_decode(data: &mut [u8]) -> Result<(), Error> {
    let available = data.len();
    if available < DIST_DOC_RECORD_LEN {
        return Err(Error::Record(format!(
            "DISTRIBUTE_DOC_DATA too short: {} (need {})",
            available, DIST_DOC_RECORD_LEN
        )));
    }

    let seed = u32::from_le_bytes([data[0], data[1], data[2], data[3]]);
    let mut rng = MsvcRand::new(seed);

    // As in pyhwp, runs are laid out starting at index 0, so the four seed bytes
    // use up the beginning of the first run(s) even though they are never XORed.
    let mut cursor = 0usize;
    while cursor < DIST_DOC_RECORD_LEN {
        // Draw order matters: key byte first, then the run length.
        let mask = rng.next_byte();
        let run_end = (cursor + rng.next_run_length()).min(DIST_DOC_RECORD_LEN);
        // Skip the seed bytes; a run lying entirely inside them gives an empty range.
        let xor_start = cursor.max(4).min(run_end);
        for byte in &mut data[xor_start..run_end] {
            *byte ^= mask;
        }
        cursor = run_end;
    }
    Ok(())
}

/// Given the 256-byte DISTRIBUTE_DOC_DATA payload, derive the 16-byte AES-128 key.
///
/// The key is the first 16 bytes of the 80-byte UCS-16LE-encoded SHA-1 hex digest
/// at offset `4 + (seed & 0xF)` after RLE-XOR decoding — NOT the hex-decoded raw digest.
/// Empirically validated: using the raw UCS-16LE bytes as-is yields a working AES key;
/// using hex-decoded SHA-1 bytes does not.
pub(crate) fn derive_aes_key(record: &[u8]) -> Result<[u8; 16], Error> {
    if record.len() != DIST_DOC_RECORD_LEN {
        return Err(Error::Record(format!(
            "DISTRIBUTE_DOC_DATA must be {} bytes, got {}",
            DIST_DOC_RECORD_LEN,
            record.len()
        )));
    }
    let mut buf = record.to_vec();
    rle_xor_decode(&mut buf)?;

    let seed = u32::from_le_bytes(buf[0..4].try_into().unwrap());
    let offset = 4 + (seed as usize & 0x0F);
    if offset + 80 > buf.len() {
        return Err(Error::Record("DISTRIBUTE_DOC_DATA hex window OOB".into()));
    }
    // Validate the window looks like UCS-16LE ASCII hex (every odd byte should be 0, every
    // even byte should be an ASCII hex char) — catches misaligned decryption early.
    for (i, &b) in buf[offset..offset + 80].iter().enumerate() {
        let ok = if i % 2 == 0 {
            b.is_ascii_hexdigit()
        } else {
            b == 0
        };
        if !ok {
            return Err(Error::Record(
                "DISTRIBUTE_DOC_DATA: hex window not valid UCS-16LE ASCII — wrong key?".into(),
            ));
        }
    }

    let mut key = [0u8; 16];
    key.copy_from_slice(&buf[offset..offset + 16]);
    Ok(key)
}

#[cfg(test)]
mod rle_tests {
    use super::*;

    /// The transform is a plain XOR mask, so "encoding" is the same call as decoding.
    fn rle_xor_roundtrip_encode(data: &mut [u8]) {
        rle_xor_decode(data).unwrap();
    }

    /// Applying the transform twice must restore the original body bytes.
    #[test]
    fn xor_is_involutive() {
        let seed: u32 = 0xDEADBEEF;
        let mut record = [0x55u8; DIST_DOC_RECORD_LEN];
        record[..4].copy_from_slice(&seed.to_le_bytes());
        let pristine = record;

        rle_xor_roundtrip_encode(&mut record);
        assert_ne!(record[4..], pristine[4..]);

        rle_xor_roundtrip_encode(&mut record);
        assert_eq!(record[4..], pristine[4..]);
    }

    /// With seed 0 the hex window starts at offset 4; the derived key must be the
    /// first 16 raw bytes of that window.
    #[test]
    fn derive_key_from_synthetic_record() {
        let hex = b"0123456789abcdef0123456789abcdef01234567";

        // Lay the 40 hex characters out as (char, 0x00) pairs starting at offset 4.
        let mut record = vec![0u8; 256];
        let window_end = 4 + 2 * hex.len();
        for (unit, &digit) in record[4..window_end].chunks_exact_mut(2).zip(hex.iter()) {
            unit[0] = digit;
            unit[1] = 0;
        }

        rle_xor_roundtrip_encode(&mut record);
        let key = derive_aes_key(&record).unwrap();

        // Eight (char, 0x00) pairs fill the 16-byte key.
        let mut expected = [0u8; 16];
        for (unit, &digit) in expected.chunks_exact_mut(2).zip(hex.iter()) {
            unit[0] = digit;
        }
        assert_eq!(key, expected);
    }
}

use aes::cipher::{BlockDecrypt, KeyInit};
use aes::Aes128;

/// Decrypts `ciphertext` with AES-128 in ECB mode and returns the plaintext as a new `Vec`.
///
/// The input is processed in independent 16-byte blocks. Any trailing bytes that
/// do not fill a whole block are dropped, so the output length is
/// `ciphertext.len()` rounded down to a multiple of 16. The input slice is not modified.
pub(crate) fn aes128_ecb_decrypt(key: &[u8; 16], ciphertext: &[u8]) -> Vec<u8> {
    let cipher = Aes128::new(key.into());
    let mut plaintext = Vec::with_capacity(ciphertext.len());
    ciphertext.chunks_exact(16).for_each(|chunk| {
        // Copy the block out so decryption never touches the caller's buffer.
        let mut block = *aes::Block::from_slice(chunk);
        cipher.decrypt_block(&mut block);
        plaintext.extend_from_slice(block.as_slice());
    });
    plaintext
}

#[cfg(test)]
mod aes_tests {
    use super::*;
    use aes::cipher::BlockEncrypt;

    /// Encrypting one block and feeding it to the decryptor returns the plaintext.
    #[test]
    fn decrypt_inverts_encrypt() {
        let key = [0x01u8; 16];
        let plaintext = b"sixteen bytes!!!";

        let mut block = *aes::Block::from_slice(plaintext);
        let encryptor = Aes128::new((&key).into());
        encryptor.encrypt_block(&mut block);

        let recovered = aes128_ecb_decrypt(&key, block.as_slice());
        assert_eq!(recovered, plaintext.to_vec());
    }

    /// Two full blocks plus seven stray bytes yield exactly two blocks of output.
    #[test]
    fn truncates_partial_trailing_block() {
        let key = [0x42u8; 16];
        let ciphertext = vec![0u8; 32 + 7];
        let decrypted = aes128_ecb_decrypt(&key, &ciphertext);
        assert_eq!(decrypted.len(), 32);
    }
}