Skip to main content

hedl_xml/
lib.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! HEDL XML Conversion
19//!
20//! Provides bidirectional conversion between HEDL documents and XML format.
21//!
22//! # Features
23//!
24//! - Convert HEDL documents to well-formed XML
25//! - Parse XML into HEDL documents with type inference
26//! - **Streaming support** for large multi-gigabyte XML files
27//! - **Async I/O** with Tokio (via `async` feature flag)
28//! - **XSD schema validation** with comprehensive error messages
29//! - **Schema caching** for high-performance validation
30//! - Configurable output formatting (pretty print, attributes)
31//! - Support for nested structures and matrix lists
32//! - Reference and expression preservation
33//!
34//! # Security
35//!
36//! ## XML External Entity (XXE) Prevention
37//!
38//! The hedl-xml crate is **protected against XXE attacks by default** through multiple layers:
39//!
40//! ### Layer 1: Safe Parser (quick-xml)
41//!
42//! The underlying [quick-xml](https://crates.io/crates/quick-xml) library does not:
43//! - Resolve external entities (file://, http://, etc.)
44//! - Process DTD entity declarations
45//! - Expand entity references defined in DOCTYPEs
46//! - Support XInclude directives
47//!
48//! This makes XXE attacks **impossible** regardless of configuration.
49//!
50//! ### Layer 2: Entity Policy Controls
51//!
52//! For defense-in-depth and compliance requirements, explicit entity policies are available:
53//!
54//! ```rust
55//! use hedl_xml::{FromXmlConfig, EntityPolicy};
56//!
57//! // Strictest: Reject any XML with DOCTYPE declarations
58//! let strict_config = FromXmlConfig::strict_security();
59//!
60//! // Default: Allow DOCTYPE but never resolve entities
61//! let default_config = FromXmlConfig::default(); // AllowDtdNoExternal
62//!
63//! // Monitoring: Warn on DTD/entity detection
64//! let warn_config = FromXmlConfig {
65//!     entity_policy: EntityPolicy::WarnOnEntities,
66//!     log_security_events: true,
67//!     ..Default::default()
68//! };
69//! ```
70//!
71//! ### XXE Attack Vectors (Mitigated)
72//!
73//! The following XXE attack patterns are **prevented**:
74//!
75//! - **File Disclosure**: `<!ENTITY xxe SYSTEM "file:///etc/passwd">` - Not expanded
76//! - **Server-Side Request Forgery**: External HTTP entities are not resolved
77//! - **Billion Laughs DoS**: Entity definitions are ignored; no expansion occurs
78//! - **Out-of-Band Exfiltration**: Parameter entities are not resolved or executed
79//!
80//! # Examples
81//!
82//! ## Converting HEDL to XML
83//!
84//! ```rust
85//! use hedl_core::{Document, Item, Value};
86//! use hedl_xml::{to_xml, ToXmlConfig};
87//! use std::collections::BTreeMap;
88//!
89//! let mut doc = Document::new((2, 0));
90//! doc.root.insert("name".to_string(), Item::Scalar(Value::String("example".to_string().into())));
91//!
92//! let config = ToXmlConfig::default();
93//! let xml = to_xml(&doc, &config).unwrap();
94//! ```
95//!
96//! ## Converting XML to HEDL
97//!
98//! ```rust
99//! use hedl_xml::{from_xml, FromXmlConfig};
100//!
101//! let xml = r#"<?xml version="1.0"?><hedl><name>example</name></hedl>"#;
102//! let config = FromXmlConfig::default();
103//! let doc = from_xml(xml, &config).unwrap();
104//! ```
105//!
106//! ## Streaming large XML files
107//!
108//! For multi-gigabyte XML files, use the streaming API to process items incrementally
109//! without loading the entire document into memory:
110//!
111//! ```rust,no_run
112//! use hedl_xml::streaming::{from_xml_stream, StreamConfig};
113//! use std::fs::File;
114//!
115//! let file = File::open("large.xml")?;
116//! let config = StreamConfig::default();
117//!
118//! for result in from_xml_stream(file, &config)? {
119//!     match result {
120//!         Ok(item) => println!("Processing: {}", item.key),
121//!         Err(e) => eprintln!("Error: {}", e),
122//!     }
123//! }
124//! # Ok::<(), Box<dyn std::error::Error>>(())
125//! ```
126//!
127//! ## XSD Schema Validation
128//!
129//! Validate XML documents against XSD schemas:
130//!
131//! ```rust
132//! use hedl_xml::schema::SchemaValidator;
133//!
134//! let schema = r#"<?xml version="1.0"?>
135//! <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
136//!   <xs:element name="person">
137//!     <xs:complexType>
138//!       <xs:sequence>
139//!         <xs:element name="name" type="xs:string"/>
140//!         <xs:element name="age" type="xs:integer"/>
141//!       </xs:sequence>
142//!     </xs:complexType>
143//!   </xs:element>
144//! </xs:schema>"#;
145//!
146//! let validator = SchemaValidator::from_xsd(schema)?;
147//!
148//! let xml = r#"<?xml version="1.0"?>
149//! <person>
150//!   <name>Alice</name>
151//!   <age>30</age>
152//! </person>"#;
153//!
154//! validator.validate(xml)?;
155//! # Ok::<(), Box<dyn std::error::Error>>(())
156//! ```
157//!
158//! ## Async I/O (with `async` feature)
159//!
160//! Enable async support in `Cargo.toml`:
161//!
162//! ```toml
163//! [dependencies]
164//! hedl-xml = { version = "*", features = ["async"] }
165//! tokio = { version = "1", features = ["full"] }
166//! ```
167//!
168//! Then use async functions:
169//!
170//! ```rust,no_run
171//! # #[cfg(feature = "async")]
172//! # {
173//! use hedl_xml::async_api::{from_xml_file_async, to_xml_file_async};
174//! use hedl_xml::{FromXmlConfig, ToXmlConfig};
175//!
176//! # #[tokio::main]
177//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
178//! // Read XML asynchronously
179//! let doc = from_xml_file_async("input.xml", &FromXmlConfig::default()).await?;
180//!
181//! // Process document...
182//!
183//! // Write XML asynchronously
184//! to_xml_file_async(&doc, "output.xml", &ToXmlConfig::default()).await?;
185//! # Ok(())
186//! # }
187//! # }
188//! ```
189
190#![cfg_attr(not(test), warn(missing_docs))]
191mod from_xml;
192/// XML schema support.
193pub mod schema;
194/// XML security validation.
195pub mod security;
196/// Streaming XML parsing.
197pub mod streaming;
198mod to_xml;
199
200#[cfg(feature = "async")]
201/// Async XML API.
202pub mod async_api;
203
204pub use from_xml::{from_xml, EntityPolicy, FromXmlConfig};
205pub use schema::{SchemaCache, SchemaValidator, ValidationError};
206pub use security::{SecurityViolation, XmlSecurityValidator};
207pub use streaming::{from_xml_stream, StreamConfig, StreamItem, XmlStreamingParser};
208pub use to_xml::{to_xml, ToXmlConfig};
209
210use hedl_core::Document;
211
212/// Convert HEDL document to XML string with default configuration
213pub fn hedl_to_xml(doc: &Document) -> Result<String, String> {
214    to_xml(doc, &ToXmlConfig::default())
215}
216
217/// Convert XML string to HEDL document with default configuration
218pub fn xml_to_hedl(xml: &str) -> Result<Document, String> {
219    from_xml(xml, &FromXmlConfig::default())
220}
221
#[cfg(test)]
mod tests {
    use super::*;
    use hedl_core::{Document, Item, MatrixList, Node, Reference, Value};
    use std::collections::BTreeMap;

    /// Wraps `text` in a HEDL string value.
    fn string_value(text: &str) -> Value {
        Value::String(text.to_string().into())
    }

    /// Serializes `doc` with `hedl_to_xml` and parses the output back with
    /// `xml_to_hedl`, panicking if either direction fails.
    fn round_trip(doc: &Document) -> Document {
        let xml = hedl_to_xml(doc).unwrap();
        xml_to_hedl(&xml).unwrap()
    }

    /// Bool, int and string scalars come back unchanged after a round trip.
    #[test]
    fn test_round_trip_scalars() {
        let mut doc = Document::new((2, 0));
        let inputs = [
            ("null_val", Value::Null),
            ("bool_val", Value::Bool(true)),
            ("int_val", Value::Int(42)),
            ("float_val", Value::Float(3.25)),
            ("string_val", string_value("hello")),
        ];
        for (key, value) in inputs {
            doc.root.insert(key.to_string(), Item::Scalar(value));
        }

        let parsed = round_trip(&doc);

        // NOTE(review): `null_val` and `float_val` are written above but never
        // asserted on; only the three keys below are actually verified.
        let expected = [
            ("bool_val", Value::Bool(true)),
            ("int_val", Value::Int(42)),
            ("string_val", string_value("hello")),
        ];
        for (key, value) in expected {
            assert_eq!(
                parsed.root.get(key).and_then(|item| item.as_scalar()),
                Some(&value)
            );
        }
    }

    /// A one-level object keeps its scalar fields across a round trip.
    #[test]
    fn test_round_trip_object() {
        let mut doc = Document::new((2, 0));
        let fields = BTreeMap::from([
            ("name".to_string(), Item::Scalar(string_value("test"))),
            ("value".to_string(), Item::Scalar(Value::Int(100))),
        ]);
        doc.root.insert("config".to_string(), Item::Object(fields));

        let parsed = round_trip(&doc);

        let config_obj = parsed
            .root
            .get("config")
            .and_then(|item| item.as_object())
            .unwrap();
        let expected = [
            ("name", string_value("test")),
            ("value", Value::Int(100)),
        ];
        for (key, value) in expected {
            assert_eq!(
                config_obj.get(key).and_then(|item| item.as_scalar()),
                Some(&value)
            );
        }
    }

    /// Local and qualified references are preserved across a round trip.
    #[test]
    fn test_round_trip_reference() {
        // Closures so the same value can be built once for the input document
        // and once more for the expectation.
        let local = || Value::Reference(Reference::local("user123"));
        let qualified = || Value::Reference(Reference::qualified("User", "456"));

        let mut doc = Document::new((2, 0));
        doc.root.insert("ref1".to_string(), Item::Scalar(local()));
        doc.root
            .insert("ref2".to_string(), Item::Scalar(qualified()));

        let parsed = round_trip(&doc);

        assert_eq!(
            parsed.root.get("ref1").and_then(|item| item.as_scalar()),
            Some(&local())
        );
        assert_eq!(
            parsed.root.get("ref2").and_then(|item| item.as_scalar()),
            Some(&qualified())
        );
    }

    /// A call expression is preserved across a round trip, compared by its
    /// string form.
    #[test]
    fn test_round_trip_expression() {
        use hedl_core::lex::{ExprLiteral, Expression, Span};

        let identifier = Expression::Identifier {
            name: "x".to_string(),
            span: Span::synthetic(),
        };
        let literal = Expression::Literal {
            value: ExprLiteral::Int(1),
            span: Span::synthetic(),
        };
        let expr = Expression::Call {
            name: "add".to_string(),
            args: vec![identifier, literal],
            span: Span::synthetic(),
        };

        let mut doc = Document::new((2, 0));
        doc.root.insert(
            "expr".to_string(),
            Item::Scalar(Value::Expression(Box::new(expr.clone()))),
        );

        let parsed = round_trip(&doc);

        // Span info is lost during the XML round trip, so compare the string
        // representation, which ignores spans.
        match parsed.root.get("expr") {
            Some(Item::Scalar(Value::Expression(e))) => {
                assert_eq!(e.to_string(), expr.to_string());
            }
            _ => panic!("Expected expression value"),
        }
    }

    /// A matrix list serializes to XML that mentions the list key and row ids.
    #[test]
    fn test_matrix_list() {
        let columns = vec![String::from("id"), String::from("name")];
        let mut list = MatrixList::new("User", columns);

        for (id, name) in [("user1", "Alice"), ("user2", "Bob")] {
            let row = Node::new("User", id, vec![string_value(id), string_value(name)]);
            list.add_row(row);
        }

        let mut doc = Document::new((2, 0));
        doc.root.insert("users".to_string(), Item::List(list));

        let xml = hedl_to_xml(&doc).unwrap();
        for needle in ["<users", "user1", "user2"] {
            assert!(xml.contains(needle));
        }
    }

    /// Text containing XML-special characters round-trips to an equal value.
    #[test]
    fn test_special_characters_escaping() {
        let mut doc = Document::new((2, 0));
        doc.root.insert(
            "text".to_string(),
            Item::Scalar(string_value("hello & goodbye <tag> \"quoted\"")),
        );

        let parsed_doc = round_trip(&doc);

        // XML escaping should be handled transparently.
        let before = doc.root.get("text").and_then(|item| item.as_scalar());
        let after = parsed_doc
            .root
            .get("text")
            .and_then(|item| item.as_scalar());

        assert_eq!(before, after);
    }

    /// A two-level nested object round-trips far enough that the outer key
    /// is still present.
    #[test]
    fn test_nested_objects() {
        let innermost = BTreeMap::from([(
            "deep".to_string(),
            Item::Scalar(string_value("value")),
        )]);
        let middle = BTreeMap::from([("nested".to_string(), Item::Object(innermost))]);

        let mut doc = Document::new((2, 0));
        doc.root.insert("outer".to_string(), Item::Object(middle));

        let parsed = round_trip(&doc);

        // NOTE(review): only the presence of the top-level key is checked; the
        // nested contents are not inspected.
        assert!(parsed.root.contains_key("outer"));
    }

    /// Pretty-printed output is longer than compact output for the same document.
    #[test]
    fn test_config_pretty_print() {
        let mut doc = Document::new((2, 0));
        doc.root
            .insert("test".to_string(), Item::Scalar(string_value("value")));

        let pretty_cfg = ToXmlConfig {
            pretty: true,
            indent: "  ".to_string(),
            ..Default::default()
        };
        let compact_cfg = ToXmlConfig {
            pretty: false,
            ..Default::default()
        };

        let pretty_len = to_xml(&doc, &pretty_cfg).unwrap().len();
        let compact_len = to_xml(&doc, &compact_cfg).unwrap().len();

        // Pretty printed should have newlines and indentation.
        assert!(pretty_len > compact_len);
    }

    /// A custom `root_element` appears as both the opening and closing tag.
    #[test]
    fn test_config_custom_root() {
        let config = ToXmlConfig {
            root_element: "custom_root".to_string(),
            ..Default::default()
        };

        let xml = to_xml(&Document::new((2, 0)), &config).unwrap();

        for tag in ["<custom_root", "</custom_root>"] {
            assert!(xml.contains(tag));
        }
    }

    /// With `include_metadata` set, the document version is emitted as an attribute.
    #[test]
    fn test_config_metadata() {
        let config = ToXmlConfig {
            include_metadata: true,
            ..Default::default()
        };

        let xml = to_xml(&Document::new((2, 1)), &config).unwrap();

        assert!(xml.contains("version=\"2.1\""));
    }

    /// A null scalar still has its key present after a round trip.
    #[test]
    fn test_empty_values() {
        let mut doc = Document::new((2, 0));
        doc.root
            .insert("empty".to_string(), Item::Scalar(Value::Null));

        let parsed = round_trip(&doc);

        assert!(parsed.root.contains_key("empty"));
    }

    /// A tensor value serializes to a `<tensor>` element containing `<item>` children.
    #[test]
    fn test_tensor_values() {
        use hedl_core::lex::Tensor;

        let elements = vec![1.0, 2.0, 3.0]
            .into_iter()
            .map(Tensor::Scalar)
            .collect();
        let tensor = Tensor::Array(elements);

        let mut doc = Document::new((2, 0));
        doc.root.insert(
            "tensor".to_string(),
            Item::Scalar(Value::Tensor(Box::new(tensor))),
        );

        let xml = hedl_to_xml(&doc).unwrap();
        for tag in ["<tensor>", "<item>"] {
            assert!(xml.contains(tag));
        }
    }

    /// With `infer_lists` enabled, repeated `<user>` elements land under the `user` key.
    #[test]
    fn test_infer_lists_config() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <hedl>
            <user id="1"><name>Alice</name></user>
            <user id="2"><name>Bob</name></user>
        </hedl>"#;

        let config = FromXmlConfig {
            infer_lists: true,
            ..Default::default()
        };

        let doc = from_xml(xml, &config).unwrap();

        // Should infer a list from repeated <user> elements.
        assert!(doc.root.contains_key("user"));
        // NOTE(review): if the entry is not an `Item::List`, the row-count check
        // below is skipped and the test still passes - confirm that is intended.
        if let Some(Item::List(list)) = doc.root.get("user") {
            assert_eq!(list.rows.len(), 2);
        }
    }

    /// XML attributes on an element are read as typed scalar fields.
    #[test]
    fn test_attributes_as_values() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <hedl>
            <item id="123" name="test" active="true"/>
        </hedl>"#;

        let doc = from_xml(xml, &FromXmlConfig::default()).unwrap();

        assert!(doc.root.contains_key("item"));
        // NOTE(review): if the entry is not an `Item::Object`, none of the
        // attribute checks below run and the test still passes - confirm that
        // is intended.
        if let Some(Item::Object(obj)) = doc.root.get("item") {
            let expected = [
                // "123" is inferred as an integer (type inference is correct).
                ("id", Value::Int(123)),
                ("name", string_value("test")),
                ("active", Value::Bool(true)),
            ];
            for (attr, value) in expected {
                assert_eq!(
                    obj.get(attr).and_then(|item| item.as_scalar()),
                    Some(&value)
                );
            }
        }
    }
}
551                Some(&Value::Bool(true))
552            );
553        }
554    }
555}