// jw-hwp-core 0.1.1
//
// Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents.
use crate::error::Error;

/// First tag ID used by HWP 5.0 data records; every tag below is an offset from it.
///
/// The offsets follow the record tag table of the Hancom HWP 5.0 file format
/// specification and are listed in ascending order.
pub const HWPTAG_BEGIN: u16 = 0x010;

// --- DocInfo stream records ---

/// Document properties record.
pub const HWPTAG_DOCUMENT_PROPERTIES: u16 = HWPTAG_BEGIN; // 0x010
/// Binary data entry. `HWPTAG_BEGIN + 1` is `ID_MAPPINGS`, so this is `+ 2`.
pub const HWPTAG_BIN_DATA: u16 = HWPTAG_BEGIN + 2; // 0x012
/// Font face name.
pub const HWPTAG_FACE_NAME: u16 = HWPTAG_BEGIN + 3; // 0x013
/// Character shape.
pub const HWPTAG_CHAR_SHAPE: u16 = HWPTAG_BEGIN + 5; // 0x015
/// Paragraph shape.
pub const HWPTAG_PARA_SHAPE: u16 = HWPTAG_BEGIN + 9; // 0x019
/// Style definition.
pub const HWPTAG_STYLE: u16 = HWPTAG_BEGIN + 10; // 0x01A
/// Distribution-document data.
pub const HWPTAG_DISTRIBUTE_DOC_DATA: u16 = HWPTAG_BEGIN + 12; // 0x01C

// --- BodyText section records ---

/// Paragraph header.
pub const HWPTAG_PARA_HEADER: u16 = HWPTAG_BEGIN + 50; // 0x042
/// Paragraph text.
pub const HWPTAG_PARA_TEXT: u16 = HWPTAG_BEGIN + 51; // 0x043
/// Per-paragraph character shape runs.
pub const HWPTAG_PARA_CHAR_SHAPE: u16 = HWPTAG_BEGIN + 52; // 0x044
/// Control header.
pub const HWPTAG_CTRL_HEADER: u16 = HWPTAG_BEGIN + 55; // 0x047
/// List header.
pub const HWPTAG_LIST_HEADER: u16 = HWPTAG_BEGIN + 56; // 0x048
/// Table.
pub const HWPTAG_TABLE: u16 = HWPTAG_BEGIN + 61; // 0x04D
/// Picture shape component. `+ 76` is the `MEMO_SHAPE` slot, not the picture record.
pub const HWPTAG_SHAPE_COMPONENT_PICTURE: u16 = HWPTAG_BEGIN + 69; // 0x055
/// Equation editor object. `+ 80` is the `TRACK_CHANGE` slot, not the equation record.
pub const HWPTAG_EQEDIT: u16 = HWPTAG_BEGIN + 72; // 0x058

/// Decoded header of a single data record.
///
/// The fields are unpacked from the 32-bit little-endian word that precedes
/// each record payload.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RecordHeader {
    /// Record tag, taken from bits 0..=9 of the header word.
    pub tag_id: u16,
    /// Level value, taken from bits 10..=19 of the header word.
    pub level: u16,
    /// Payload length in bytes: bits 20..=31 of the header word, or the
    /// following 32-bit little-endian word when those bits are all ones (`0xFFF`).
    pub size: u32,
}

/// One decoded record: its header plus a borrowed view of the payload bytes.
#[derive(Debug, Clone)]
pub struct Record<'a> {
    /// Tag, level, and payload size decoded for this record.
    pub header: RecordHeader,
    /// Payload bytes, borrowed from the buffer being iterated (not copied).
    pub payload: &'a [u8],
}

/// Iterator that walks a byte buffer and yields the records packed in it.
pub struct RecordIter<'a> {
    /// Buffer being decoded.
    data: &'a [u8],
    /// Byte offset into `data` at which the next record header is read.
    pos: usize,
}

impl<'a> RecordIter<'a> {
    pub fn new(data: &'a [u8]) -> Self {
        Self { data, pos: 0 }
    }
    pub fn offset(&self) -> usize {
        self.pos
    }
    pub fn set_offset(&mut self, pos: usize) {
        self.pos = pos;
    }
}

impl<'a> Iterator for RecordIter<'a> {
    type Item = Result<Record<'a>, Error>;
    fn next(&mut self) -> Option<Self::Item> {
        if self.pos >= self.data.len() {
            return None;
        }
        if self.pos + 4 > self.data.len() {
            return Some(Err(Error::Record(format!(
                "truncated header at {}",
                self.pos
            ))));
        }
        let word = u32::from_le_bytes(self.data[self.pos..self.pos + 4].try_into().unwrap());
        self.pos += 4;
        let tag_id = (word & 0x3FF) as u16;
        let level = ((word >> 10) & 0x3FF) as u16;
        let mut size = (word >> 20) & 0xFFF;
        if size == 0xFFF {
            if self.pos + 4 > self.data.len() {
                return Some(Err(Error::Record("truncated extended size".into())));
            }
            size = u32::from_le_bytes(self.data[self.pos..self.pos + 4].try_into().unwrap());
            self.pos += 4;
        }
        let size = size as usize;
        if self.pos + size > self.data.len() {
            return Some(Err(Error::Record(format!(
                "record body overruns at {} (need {}, have {})",
                self.pos,
                size,
                self.data.len() - self.pos
            ))));
        }
        let payload = &self.data[self.pos..self.pos + size];
        self.pos += size;
        Some(Ok(Record {
            header: RecordHeader {
                tag_id,
                level,
                size: size as u32,
            },
            payload,
        }))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the header bytes for a record: the packed 32-bit word, followed
    /// by a 32-bit extended size word when `size` does not fit below `0xFFF`.
    fn encode_header(tag: u16, level: u16, size: u32) -> Vec<u8> {
        let word = (tag as u32 & 0x3FF) | ((level as u32 & 0x3FF) << 10) | (size.min(0xFFF) << 20);
        let mut v = word.to_le_bytes().to_vec();
        if size >= 0xFFF {
            v.extend_from_slice(&size.to_le_bytes());
        }
        v
    }

    #[test]
    fn parses_small_record() {
        let mut buf = encode_header(HWPTAG_PARA_TEXT, 1, 4);
        buf.extend_from_slice(&[0xAA, 0xBB, 0xCC, 0xDD]);
        let rec = RecordIter::new(&buf).next().unwrap().unwrap();
        assert_eq!(rec.header.tag_id, HWPTAG_PARA_TEXT);
        assert_eq!(rec.header.level, 1);
        assert_eq!(rec.header.size, 4);
        assert_eq!(rec.payload, &[0xAA, 0xBB, 0xCC, 0xDD]);
    }

    #[test]
    fn parses_extended_size_record() {
        let payload = vec![0x11u8; 5000];
        let mut buf = encode_header(HWPTAG_PARA_TEXT, 0, 5000);
        buf.extend_from_slice(&payload);
        let rec = RecordIter::new(&buf).next().unwrap().unwrap();
        assert_eq!(rec.header.size, 5000);
        assert_eq!(rec.payload.len(), 5000);
    }

    /// A size of exactly `0xFFF` collides with the "extended size follows"
    /// marker, so it must round-trip through the extension word.
    #[test]
    fn parses_record_at_extended_size_boundary() {
        let mut buf = encode_header(HWPTAG_PARA_TEXT, 0, 0xFFF);
        assert_eq!(buf.len(), 8);
        buf.extend_from_slice(&vec![0x22u8; 0xFFF]);
        let rec = RecordIter::new(&buf).next().unwrap().unwrap();
        assert_eq!(rec.header.size, 0xFFF);
        assert_eq!(rec.payload.len(), 0xFFF);
    }

    #[test]
    fn iterates_multiple_records() {
        let mut buf = encode_header(HWPTAG_PARA_HEADER, 0, 0);
        buf.extend(encode_header(HWPTAG_PARA_TEXT, 1, 2));
        buf.extend_from_slice(&[0x01, 0x02]);
        let recs: Vec<_> = RecordIter::new(&buf).collect::<Result<_, _>>().unwrap();
        assert_eq!(recs.len(), 2);
        assert_eq!(recs[0].header.tag_id, HWPTAG_PARA_HEADER);
        assert_eq!(recs[1].header.size, 2);
    }

    #[test]
    fn empty_buffer_yields_nothing() {
        let buf: [u8; 0] = [];
        assert!(RecordIter::new(&buf).next().is_none());
    }

    #[test]
    fn truncated_header_errors() {
        let buf = vec![0u8; 2];
        let err = RecordIter::new(&buf).next().unwrap().unwrap_err();
        assert!(matches!(err, Error::Record(_)));
    }

    #[test]
    fn truncated_extended_size_errors() {
        // Keep the header word but cut the extension word in half.
        let mut buf = encode_header(HWPTAG_PARA_TEXT, 0, 5000);
        buf.truncate(6);
        let err = RecordIter::new(&buf).next().unwrap().unwrap_err();
        assert!(matches!(err, Error::Record(_)));
    }

    #[test]
    fn body_overrun_errors() {
        // Header promises 10 payload bytes; only 2 are present.
        let mut buf = encode_header(HWPTAG_PARA_TEXT, 0, 10);
        buf.extend_from_slice(&[0x01, 0x02]);
        let err = RecordIter::new(&buf).next().unwrap().unwrap_err();
        assert!(matches!(err, Error::Record(_)));
    }
}