jw-hwp-core 0.1.2

Read-only parser for Hancom HWP 5.0 (binary CFB) and HWPX (OWPML) documents
Documentation
use crate::doc_info::DocumentProperties;
use crate::error::Warning;
use crate::structure::StructureNode;
use crate::summary::Metadata;

/// Top-level document model: a version string, metadata, properties, shape
/// tables, the list of sections, an asset catalog, and a list of warnings.
///
/// Only `serde::Serialize` is derived here (no `Deserialize` in this derive
/// list), and equality is `PartialEq` without `Eq`.
#[derive(Debug, Clone, serde::Serialize, PartialEq)]
pub struct HwpDocument {
    /// Version string; the unit test in this file uses `"5.0.3.0"`.
    pub version: String,
    /// Document metadata (type defined in `crate::summary`).
    pub metadata: Metadata,
    /// Document properties (type defined in `crate::doc_info`).
    pub properties: DocumentProperties,
    /// Shape tables (type defined in `crate::shape`).
    // NOTE(review): presumably what `para_shape_id` / char shape ids refer
    // into — confirm against `crate::shape`.
    pub shapes: crate::shape::ShapeTables,
    /// Sections of the document; `full_text` walks them in vector order.
    pub sections: Vec<Section>,
    /// Asset catalog (type defined in `crate::assets`).
    // NOTE(review): `default` is a deserialization-side serde attribute; with
    // only `Serialize` derived on this struct it presumably has no effect —
    // confirm whether a `Deserialize` derive was intended.
    #[serde(default)]
    pub assets: crate::assets::AssetCatalog,
    /// Warnings attached to the document (type defined in `crate::error`).
    // NOTE(review): presumably non-fatal issues collected while parsing — confirm.
    pub warnings: Vec<Warning>,
}

/// Per-paragraph detail: the text plus shape ids, run boundaries, and any
/// attached footnotes, equation, or image references.
///
/// The three trailing fields are left out of the serialized output when they
/// are empty / `None` (see their `skip_serializing_if` attributes).
#[derive(Debug, Clone, serde::Serialize, PartialEq, Eq)]
pub struct ParagraphDetail {
    /// Text of the paragraph.
    pub text: String,
    /// Identifier of the paragraph shape.
    // NOTE(review): presumably an index into the document's shape tables — confirm.
    pub para_shape_id: u32,
    /// Sorted list of `(char_index_in_text, char_shape_id)` boundaries.
    pub runs: Vec<(u32, u32)>,
    /// Footnote/endnote bodies attached to this paragraph; skipped during
    /// serialization when the vector is empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub footnotes: Vec<FootnoteBody>,
    /// Optional equation string; skipped during serialization when `None`.
    // NOTE(review): the format of the equation text is not visible here.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub equation: Option<String>,
    /// Image references attached to this paragraph; skipped during
    /// serialization when the vector is empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub image_refs: Vec<ImageRef>,
}

/// Body of a single note attached to a paragraph: its kind and its text.
#[derive(Debug, Clone, serde::Serialize, PartialEq, Eq)]
pub struct FootnoteBody {
    /// `"footnote"` | `"endnote"`.
    // Stored as a free-form `String`; nothing in this type enforces the two
    // values listed above.
    pub kind: String,
    /// Text of the note.
    pub text: String,
}

/// Reference from a paragraph to an image, carried as a numeric id.
#[derive(Debug, Clone, serde::Serialize, PartialEq, Eq)]
pub struct ImageRef {
    /// Numeric id of the referenced image data.
    // NOTE(review): presumably identifies an entry in the document's binary
    // data / asset catalog — confirm against `crate::assets`.
    pub bin_id: u16,
}

/// One section of a document: its index, paragraph texts, per-paragraph
/// details, structure nodes, and tables.
#[derive(Debug, Clone, serde::Serialize, PartialEq, Eq)]
pub struct Section {
    /// Index of this section; the unit test in this file numbers sections
    /// `0`, `1`, ... in vector order.
    pub index: usize,
    /// Paragraph texts; these are what `HwpDocument::full_text` joins.
    pub paragraphs: Vec<String>,
    /// Detailed per-paragraph records.
    // NOTE(review): presumably parallel to `paragraphs` (same length and
    // order) — nothing in this file enforces that; confirm against the parser.
    pub paragraph_details: Vec<ParagraphDetail>,
    /// Structure nodes for this section (type defined in `crate::structure`).
    pub structure: Vec<StructureNode>,
    /// Tables for this section (type defined in `crate::table`).
    pub tables: Vec<crate::table::Table>,
}

impl HwpDocument {
    /// Returns the text of every paragraph in the document as one string.
    ///
    /// Sections are visited in vector order, and within each section the
    /// entries of `paragraphs` are visited in order. Consecutive paragraphs
    /// are separated by a single `'\n'`, including across section boundaries;
    /// no trailing newline is added. A document with no paragraphs yields an
    /// empty string.
    pub fn full_text(&self) -> String {
        // Borrow each paragraph as `&str` rather than cloning the `String`:
        // `join` copies the bytes into the result anyway, so a per-paragraph
        // clone would only add a throwaway allocation.
        let paragraphs: Vec<&str> = self
            .sections
            .iter()
            .flat_map(|section| section.paragraphs.iter().map(String::as_str))
            .collect();
        paragraphs.join("\n")
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::structure::NodeKind;

    /// Builds a `Section` that carries only paragraph texts; the detail,
    /// structure, and table vectors are left empty.
    fn text_only_section(index: usize, paragraphs: &[&str]) -> Section {
        Section {
            index,
            paragraphs: paragraphs.iter().map(|&p| p.to_owned()).collect(),
            paragraph_details: Vec::new(),
            structure: Vec::new(),
            tables: Vec::new(),
        }
    }

    /// Paragraphs from consecutive sections end up in one string, separated
    /// by `\n` with no trailing newline.
    #[test]
    fn full_text_joins_paragraphs_with_newlines() {
        let sections = vec![
            text_only_section(0, &["a", "b"]),
            text_only_section(1, &["c"]),
        ];
        let doc = HwpDocument {
            version: "5.0.3.0".into(),
            metadata: Metadata::default(),
            properties: DocumentProperties::default(),
            shapes: Default::default(),
            sections,
            assets: Default::default(),
            warnings: Vec::new(),
        };

        let joined = doc.full_text();

        assert_eq!(joined, "a\nb\nc");
    }

    /// Constructs a node with an explicit `NodeKind::Paragraph` and checks
    /// that the kind compares equal to that variant.
    #[test]
    fn structure_node_defaults_to_paragraph() {
        let paragraph_node = StructureNode {
            id: "0:0".into(),
            kind: NodeKind::Paragraph,
            preview: "hello".into(),
            ctrl_id: None,
        };

        assert_eq!(paragraph_node.kind, NodeKind::Paragraph);
    }
}