// jw-hwp-core 0.1.2
//
// Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents.
// Documentation
//! Per-paragraph classification: text vs. container control (table, image, ...).

use serde::Serialize;

/// Classification assigned to a node in the document structure.
///
/// The mapping from control IDs to the control variants lives in `classify`.
/// Variants serialize under `snake_case` names (for example
/// `SectionDefinition` becomes `"section_definition"`).
///
/// The enum is fieldless, so it is `Copy` and `Hash` in addition to the
/// comparison traits: values can be passed by value and used as map/set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum NodeKind {
    /// Ordinary text paragraph. `classify` never returns this variant; it is
    /// presumably assigned by the caller when a paragraph has no control
    /// header (confirm at the construction site).
    Paragraph,
    /// Table control.
    Table,
    /// Drawing/shape object control, reported as an image.
    Image,
    /// Equation control.
    Equation,
    /// Page header control.
    Header,
    /// Page footer control.
    Footer,
    /// Footnote control.
    Footnote,
    /// Endnote control.
    Endnote,
    /// Section definition control.
    SectionDefinition,
    /// Column definition control.
    ColumnDefinition,
    /// Any other control whose ID starts with `%`.
    Field,
    /// Control ID that `classify` does not recognise.
    Unsupported,
}

/// One classified node: its ID, kind, a short text preview and, when the
/// paragraph carries a control header, that control's raw ID.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct StructureNode {
    /// Stable ID shaped like `"{section}:{paragraph}"`.
    pub id: String,
    /// Classification of this node.
    pub kind: NodeKind,
    /// Short text preview of the node.
    // NOTE(review): presumably built with `make_preview`; confirm at the
    // construction site, which is not visible in this file.
    pub preview: String,
    /// Raw ASCII FourCC of the first control header in the paragraph, if any.
    pub ctrl_id: Option<String>,
}

/// Map an HWP control-header ASCII FourCC to a [`NodeKind`].
///
/// Input is 4 ASCII bytes in file order (e.g. `[b't', b'b', b'l', b' ']`).
///
/// Recognised IDs:
///
/// | ID                | Kind                |
/// |-------------------|---------------------|
/// | `tbl `            | `Table`             |
/// | `gso `            | `Image`             |
/// | `eqed`            | `Equation`          |
/// | `head`, `hedd`    | `Header`            |
/// | `foot`            | `Footer`            |
/// | `fn  `, `%fn `    | `Footnote`          |
/// | `en  `, `%en `    | `Endnote`           |
/// | `secd`            | `SectionDefinition` |
/// | `cold`            | `ColumnDefinition`  |
/// | other `%…`        | `Field`             |
/// | anything else     | `Unsupported`       |
///
/// This function never returns `NodeKind::Paragraph`.
pub fn classify(ctrl: [u8; 4]) -> NodeKind {
    match &ctrl {
        b"tbl " => NodeKind::Table,
        b"gso " => NodeKind::Image,
        b"eqed" => NodeKind::Equation,
        // `head`, `fn  ` and `en  ` are the spellings listed in the HWP 5.0
        // control-ID table. `hedd`, `%fn ` and `%en ` were the only spellings
        // accepted previously and stay accepted so existing callers see no
        // change.
        b"head" | b"hedd" => NodeKind::Header,
        b"foot" => NodeKind::Footer,
        b"fn  " | b"%fn " => NodeKind::Footnote,
        b"en  " | b"%en " => NodeKind::Endnote,
        b"secd" => NodeKind::SectionDefinition,
        b"cold" => NodeKind::ColumnDefinition,
        // Must stay below the `%fn ` / `%en ` arms: every remaining
        // `%`-prefixed ID is treated as a field control.
        [b'%', ..] => NodeKind::Field,
        _ => NodeKind::Unsupported,
    }
}

/// Build a cheap one-line preview of `text`.
///
/// Returns at most the first 60 Unicode scalar values (`char`s) of `text`,
/// with every `'\n'` replaced by a space. Truncation counts `char`s, not
/// bytes, so multi-byte text is never cut in the middle of a character.
/// All other characters, including `'\r'`, pass through unchanged.
pub fn make_preview(text: &str) -> String {
    // Maximum number of `char`s kept in a preview.
    const PREVIEW_CHARS: usize = 60;

    // Truncate first, then flatten: the replacement is one-for-one, so the
    // result equals flattening the whole text and then truncating, but only
    // the kept prefix is visited and no intermediate full-length copy is made.
    text.chars()
        .take(PREVIEW_CHARS)
        .map(|c| if c == '\n' { ' ' } else { c })
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Known control IDs map to their kinds, an unknown `%`-prefixed ID is a
    /// field, and anything else is unsupported.
    #[test]
    fn classifies_known_ctrl_ids() {
        let cases: [(&[u8; 4], NodeKind); 6] = [
            (b"tbl ", NodeKind::Table),
            (b"gso ", NodeKind::Image),
            (b"secd", NodeKind::SectionDefinition),
            (b"%fn ", NodeKind::Footnote),
            (b"%xy ", NodeKind::Field),
            (b"xxxx", NodeKind::Unsupported),
        ];

        for (ctrl, expected) in &cases {
            assert_eq!(classify(**ctrl), *expected, "ctrl id {:?}", ctrl);
        }
    }

    /// A 300-char input with embedded newlines is cut to 60 chars and comes
    /// back without any newline.
    #[test]
    fn preview_trims_to_60_and_flattens_newlines() {
        let preview = make_preview(&"a\nb".repeat(100));

        assert_eq!(preview.chars().count(), 60);
        assert!(!preview.contains('\n'));
    }
}