Skip to main content

panproto_protocols/web_document/
docx.rs

//! DOCX/Office Open XML protocol definition.
//!
//! Uses Group E theory: constrained multigraph + W-type + metadata.
4
5use std::collections::HashMap;
6use std::hash::BuildHasher;
7
8use panproto_gat::Theory;
9use panproto_schema::{EdgeRule, Protocol, Schema, SchemaBuilder};
10
11use crate::emit::{children_by_edge, find_roots, vertex_constraints};
12use crate::error::ProtocolError;
13use crate::theories;
14
15/// Returns the DOCX protocol definition.
16#[must_use]
17pub fn protocol() -> Protocol {
18    Protocol {
19        name: "docx".into(),
20        schema_theory: "ThDocxSchema".into(),
21        instance_theory: "ThDocxInstance".into(),
22        edge_rules: edge_rules(),
23        obj_kinds: vec![
24            "document".into(),
25            "body".into(),
26            "paragraph".into(),
27            "run".into(),
28            "text".into(),
29            "table".into(),
30            "row".into(),
31            "cell".into(),
32            "section".into(),
33            "header".into(),
34            "footer".into(),
35            "style".into(),
36            "numbering".into(),
37            "footnote".into(),
38            "image".into(),
39            "hyperlink".into(),
40        ],
41        constraint_sorts: vec![
42            "required".into(),
43            "style-type".into(),
44            "numbering-format".into(),
45        ],
46        has_order: true,
47        nominal_identity: true,
48        ..Protocol::default()
49    }
50}
51
52/// Register the component GATs for DOCX.
53pub fn register_theories<S: BuildHasher>(registry: &mut HashMap<String, Theory, S>) {
54    theories::register_multigraph_wtype_meta(registry, "ThDocxSchema", "ThDocxInstance");
55}
56
57/// Parse a JSON-based DOCX content model into a [`Schema`].
58///
59/// # Errors
60///
61/// Returns [`ProtocolError`] if parsing fails.
62pub fn parse_docx_schema(json: &serde_json::Value) -> Result<Schema, ProtocolError> {
63    let proto = protocol();
64    let mut builder = SchemaBuilder::new(&proto);
65
66    let elements = json
67        .get("elements")
68        .and_then(serde_json::Value::as_object)
69        .ok_or_else(|| ProtocolError::MissingField("elements".into()))?;
70
71    for (name, def) in elements {
72        let kind = def
73            .get("kind")
74            .and_then(serde_json::Value::as_str)
75            .unwrap_or("document");
76        builder = builder.vertex(name, kind, None)?;
77        // Every top-level DOCX element is a candidate entry basepoint.
78        if kind == "document" {
79            builder = builder.entry(name);
80        }
81
82        for field in &["required", "style-type", "numbering-format"] {
83            if let Some(val) = def.get(field).and_then(serde_json::Value::as_str) {
84                builder = builder.constraint(name, field, val);
85            }
86        }
87
88        if let Some(children) = def.get("children").and_then(serde_json::Value::as_object) {
89            for (child_name, child_def) in children {
90                let child_id = format!("{name}.{child_name}");
91                let child_kind = child_def
92                    .get("kind")
93                    .and_then(serde_json::Value::as_str)
94                    .unwrap_or("text");
95                builder = builder.vertex(&child_id, child_kind, None)?;
96                builder = builder.edge(name, &child_id, "prop", Some(child_name))?;
97
98                for field in &["required", "style-type"] {
99                    if let Some(val) = child_def.get(field).and_then(serde_json::Value::as_str) {
100                        builder = builder.constraint(&child_id, field, val);
101                    }
102                }
103            }
104        }
105
106        if let Some(items) = def.get("items").and_then(serde_json::Value::as_array) {
107            for (i, item) in items.iter().enumerate() {
108                if let Some(item_kind) = item.as_str() {
109                    let item_id = format!("{name}:item{i}");
110                    builder = builder.vertex(&item_id, item_kind, None)?;
111                    builder = builder.edge(name, &item_id, "items", Some(item_kind))?;
112                }
113            }
114        }
115    }
116
117    let schema = builder.build()?;
118    Ok(schema)
119}
120
121/// Emit a [`Schema`] as a JSON DOCX schema.
122///
123/// # Errors
124///
125/// Returns [`ProtocolError`] if emission fails.
126pub fn emit_docx_schema(schema: &Schema) -> Result<serde_json::Value, ProtocolError> {
127    let structural = &["prop", "items"];
128    let roots = find_roots(schema, structural);
129
130    let mut elements = serde_json::Map::new();
131    for root in &roots {
132        let mut obj = serde_json::Map::new();
133        obj.insert("kind".into(), serde_json::json!(root.kind));
134
135        for c in vertex_constraints(schema, &root.id) {
136            obj.insert(c.sort.to_string(), serde_json::json!(c.value));
137        }
138
139        let props = children_by_edge(schema, &root.id, "prop");
140        if !props.is_empty() {
141            let mut children = serde_json::Map::new();
142            for (edge, child) in &props {
143                let child_name = edge.name.as_deref().unwrap_or(&child.id);
144                let mut child_obj = serde_json::Map::new();
145                child_obj.insert("kind".into(), serde_json::json!(child.kind));
146                for c in vertex_constraints(schema, &child.id) {
147                    child_obj.insert(c.sort.to_string(), serde_json::json!(c.value));
148                }
149                children.insert(child_name.to_string(), serde_json::Value::Object(child_obj));
150            }
151            obj.insert("children".into(), serde_json::Value::Object(children));
152        }
153
154        let items = children_by_edge(schema, &root.id, "items");
155        if !items.is_empty() {
156            let arr: Vec<serde_json::Value> = items
157                .iter()
158                .filter_map(|(e, _)| e.name.as_deref().map(|n| serde_json::json!(n)))
159                .collect();
160            obj.insert("items".into(), serde_json::Value::Array(arr));
161        }
162
163        elements.insert(root.id.to_string(), serde_json::Value::Object(obj));
164    }
165
166    Ok(serde_json::json!({ "elements": elements }))
167}
168
169fn edge_rules() -> Vec<EdgeRule> {
170    vec![
171        EdgeRule {
172            edge_kind: "prop".into(),
173            src_kinds: vec![
174                "document".into(),
175                "body".into(),
176                "paragraph".into(),
177                "run".into(),
178                "table".into(),
179                "row".into(),
180                "cell".into(),
181                "section".into(),
182            ],
183            tgt_kinds: vec![],
184        },
185        EdgeRule {
186            edge_kind: "items".into(),
187            src_kinds: vec![
188                "document".into(),
189                "body".into(),
190                "paragraph".into(),
191                "table".into(),
192            ],
193            tgt_kinds: vec![],
194        },
195    ]
196}
197
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;

    /// The protocol descriptor identifies itself as `docx`.
    #[test]
    fn protocol_def() {
        assert_eq!(protocol().name, "docx");
    }

    /// Registering the theories makes both theory names available.
    #[test]
    fn register_theories_works() {
        let mut theories_by_name = HashMap::new();
        register_theories(&mut theories_by_name);

        let has_schema_theory = theories_by_name.contains_key("ThDocxSchema");
        let has_instance_theory = theories_by_name.contains_key("ThDocxInstance");
        assert!(has_schema_theory);
        assert!(has_instance_theory);
    }

    /// Parsing, emitting and re-parsing keeps the number of vertices stable.
    #[test]
    fn parse_and_emit() {
        let input = serde_json::json!({
            "elements": {
                "document": {
                    "kind": "document",
                    "children": {
                        "body": {"kind": "body"}
                    },
                    "items": ["paragraph", "table"]
                }
            }
        });

        let first = parse_docx_schema(&input).expect("should parse");
        for vertex_id in ["document", "document.body"] {
            assert!(first.has_vertex(vertex_id));
        }

        let round_tripped = emit_docx_schema(&first).expect("should emit");
        let second = parse_docx_schema(&round_tripped).expect("re-parse");
        assert_eq!(first.vertex_count(), second.vertex_count());
    }
}