// jw-hwp-core 0.1.0
//
// Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents.
// Documentation
use crate::error::Error;
use std::io::Read;

/// Minimum number of bytes `FileHeader::parse` requires; shorter input is
/// rejected with `Error::InvalidHeader`.
pub const FILE_HEADER_LEN: usize = 256;
/// Byte string that must appear at the very start of the header bytes.
pub const HWP_SIGNATURE: &[u8] = b"HWP Document File";

/// Decoded form of the header bytes: the format version plus three property
/// flags taken from the little-endian `u32` stored at bytes 36..40.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FileHeader {
    /// Version decoded from bytes 32..36 of the header.
    pub version: Version,
    /// Bit 0 of the property word. When set, `Container` inflates the
    /// `/DocInfo` and section streams after reading them.
    pub compressed: bool,
    /// Bit 1 of the property word. `Container::open` refuses documents that
    /// have this flag set.
    pub encrypted: bool,
    /// Bit 2 of the property word. When set, `Container` reads sections from
    /// `/ViewText` instead of `/BodyText`.
    pub distributed: bool,
}

/// Four-part format version.
///
/// `FileHeader::parse` reads it from four consecutive bytes at offset 32, in
/// the order `[revision, build, minor, major]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Version {
    /// Fourth byte of the stored version (offset 35).
    pub major: u8,
    /// Third byte of the stored version (offset 34).
    pub minor: u8,
    /// Second byte of the stored version (offset 33).
    pub build: u8,
    /// First byte of the stored version (offset 32).
    pub revision: u8,
}

impl FileHeader {
    pub fn parse(bytes: &[u8]) -> Result<Self, Error> {
        if bytes.len() < FILE_HEADER_LEN {
            return Err(Error::InvalidHeader(format!(
                "expected {} bytes, got {}",
                FILE_HEADER_LEN,
                bytes.len()
            )));
        }
        if !bytes[..HWP_SIGNATURE.len()].eq(HWP_SIGNATURE) {
            return Err(Error::InvalidHeader("signature mismatch".into()));
        }
        // version is little-endian at offset 32: [revision, build, minor, major]
        let v = &bytes[32..36];
        let version = Version {
            revision: v[0],
            build: v[1],
            minor: v[2],
            major: v[3],
        };
        let props = u32::from_le_bytes(bytes[36..40].try_into().unwrap());
        Ok(FileHeader {
            version,
            compressed: props & 0b0001 != 0,
            encrypted: props & 0b0010 != 0,
            distributed: props & 0b0100 != 0,
        })
    }
}

pub fn read_all<R: Read>(mut r: R) -> Result<Vec<u8>, Error> {
    let mut buf = Vec::new();
    r.read_to_end(&mut buf).map_err(Error::Io)?;
    Ok(buf)
}

use cfb::CompoundFile;
use flate2::read::DeflateDecoder;
use std::fs::File;
use std::path::Path;

/// An opened HWP compound file together with its parsed [`FileHeader`].
pub struct Container {
    // Underlying compound file, backed by an open `File` handle.
    cfb: CompoundFile<File>,
    /// Header parsed from the `/FileHeader` stream by `Container::open`.
    pub header: FileHeader,
    // `Container::open` sets this to `None` and nothing visible in this file
    // reads it, hence the lint allowance.
    // NOTE(review): presumably reserved for caching the distributed-document
    // AES key; confirm against the rest of the crate.
    #[allow(dead_code)]
    pub(crate) aes_key: Option<[u8; 16]>,
}

impl Container {
    pub fn open(path: &Path) -> Result<Self, Error> {
        if !path.exists() {
            return Err(Error::NotFound(path.to_path_buf()));
        }
        let cfb =
            CompoundFile::open(File::open(path)?).map_err(|e| Error::Container(e.to_string()))?;
        let mut this = Self {
            cfb,
            header: FileHeader {
                version: Version {
                    major: 0,
                    minor: 0,
                    build: 0,
                    revision: 0,
                },
                compressed: false,
                encrypted: false,
                distributed: false,
            },
            aes_key: None,
        };
        this.header = this.read_file_header()?;
        if this.header.encrypted {
            return Err(Error::Container(
                "password-encrypted HWP not supported".into(),
            ));
        }
        Ok(this)
    }

    fn read_file_header(&mut self) -> Result<FileHeader, Error> {
        let mut s = self
            .cfb
            .open_stream("/FileHeader")
            .map_err(|_| Error::MissingStream("FileHeader".into()))?;
        let bytes = read_all(&mut s)?;
        FileHeader::parse(&bytes)
    }

    /// Returns the on-disk byte length of a stream if it exists, else `None`.
    pub fn stream_size(&mut self, stream_name: &str) -> Option<u64> {
        use std::io::{Seek, SeekFrom};
        if !self.cfb.exists(stream_name) {
            return None;
        }
        let mut s = self.cfb.open_stream(stream_name).ok()?;
        s.seek(SeekFrom::End(0)).ok()
    }

    /// Returns names like "Section0", "Section1", ... in ascending index order.
    pub fn section_names(&self) -> Vec<String> {
        let storage = if self.header.distributed {
            "/ViewText"
        } else {
            "/BodyText"
        };
        let mut out: Vec<(u32, String)> = self
            .cfb
            .read_storage(storage)
            .into_iter()
            .flatten()
            .filter_map(|entry| {
                let name = entry.name().to_string();
                let idx: u32 = name.strip_prefix("Section")?.parse().ok()?;
                Some((idx, name))
            })
            .collect();
        out.sort_by_key(|(i, _)| *i);
        out.into_iter().map(|(_, n)| n).collect()
    }

    pub fn read_raw_stream(&mut self, path: &str) -> Result<Vec<u8>, Error> {
        let mut s = self
            .cfb
            .open_stream(path)
            .map_err(|_| Error::MissingStream(path.into()))?;
        let raw = read_all(&mut s)?;
        let decompress =
            self.header.compressed && (path == "/DocInfo" || path.starts_with("/BodyText/"));
        if decompress {
            let mut dec = DeflateDecoder::new(&raw[..]);
            let mut out = Vec::new();
            dec.read_to_end(&mut out).map_err(|e| Error::Decompress {
                stream: path.into(),
                source: e,
            })?;
            Ok(out)
        } else {
            Ok(raw)
        }
    }

    /// Reads and (if needed) decompresses a section stream.
    pub fn read_section(&mut self, name: &str) -> Result<Vec<u8>, Error> {
        if self.header.distributed {
            self.read_distributed_section(name)
        } else {
            self.read_raw_stream(&format!("/BodyText/{}", name))
        }
    }

    fn read_distributed_section(&mut self, name: &str) -> Result<Vec<u8>, Error> {
        let stream_path = format!("/ViewText/{}", name);
        let mut s = self
            .cfb
            .open_stream(&stream_path)
            .map_err(|_| Error::MissingStream(stream_path.clone()))?;
        let raw = read_all(&mut s)?;
        if raw.len() < 260 {
            return Err(Error::Container(format!(
                "distributed {} too short: {} bytes (need >= 260)",
                stream_path,
                raw.len()
            )));
        }
        let word = u32::from_le_bytes(raw[0..4].try_into().unwrap());
        let tag_id = (word & 0x3FF) as u16;
        if tag_id != crate::record::HWPTAG_DISTRIBUTE_DOC_DATA {
            return Err(Error::Container(format!(
                "distributed {} missing DISTRIBUTE_DOC_DATA header (tag_id=0x{:03X})",
                stream_path, tag_id
            )));
        }
        let payload = &raw[4..260];
        let key = crate::dist::derive_aes_key(payload)?;
        let encrypted_body = &raw[260..];
        let decrypted = crate::dist::aes128_ecb_decrypt(&key, encrypted_body);
        if self.header.compressed {
            let mut dec = DeflateDecoder::new(&decrypted[..]);
            let mut out = Vec::new();
            dec.read_to_end(&mut out).map_err(|e| Error::Decompress {
                stream: stream_path,
                source: e,
            })?;
            Ok(out)
        } else {
            Ok(decrypted)
        }
    }
}

#[cfg(test)]
mod open_tests {
    use super::*;

    /// A path that does not exist must be reported as `Error::NotFound`.
    #[test]
    fn open_missing_file_returns_notfound() {
        let missing = Path::new("/tmp/does-not-exist-hwp.hwp");
        assert!(matches!(
            Container::open(missing),
            Err(Error::NotFound(_))
        ));
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `FILE_HEADER_LEN`-byte header carrying the signature,
    /// version 5.0.3.0, and a property word with only the compression bit
    /// (possibly) set.
    fn make_header(compressed: bool) -> Vec<u8> {
        let mut header = Vec::with_capacity(FILE_HEADER_LEN);
        header.extend_from_slice(HWP_SIGNATURE);
        // Zero-pad up to the version field at offset 32.
        header.resize(32, 0);
        // Version 5.0.3.0 is stored as [revision, build, minor, major].
        header.extend_from_slice(&[0, 3, 0, 5]);
        // Property word: bit 0 is the compression flag.
        header.extend_from_slice(&u32::from(compressed).to_le_bytes());
        header.resize(FILE_HEADER_LEN, 0);
        header
    }

    /// A well-formed header yields the stored version and flags.
    #[test]
    fn parses_valid_header_compressed() {
        let parsed = FileHeader::parse(&make_header(true)).unwrap();
        let expected_version = Version {
            major: 5,
            minor: 0,
            build: 3,
            revision: 0,
        };
        assert_eq!(parsed.version, expected_version);
        assert!(parsed.compressed);
        assert!(!parsed.encrypted);
    }

    /// Corrupting the first signature byte is reported as an invalid header.
    #[test]
    fn rejects_bad_signature() {
        let mut bytes = make_header(true);
        bytes[0] = b'X';
        let outcome = FileHeader::parse(&bytes);
        assert!(matches!(outcome, Err(Error::InvalidHeader(_))));
    }

    /// Input shorter than the fixed header length is rejected.
    #[test]
    fn rejects_short_input() {
        let too_short = [0u8; 10];
        assert!(FileHeader::parse(&too_short).is_err());
    }
}