hedl_xml/
lib.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! HEDL XML Conversion
19//!
20//! Provides bidirectional conversion between HEDL documents and XML format.
21//!
22//! # Features
23//!
24//! - Convert HEDL documents to well-formed XML
25//! - Parse XML into HEDL documents with type inference
26//! - **Streaming support** for large multi-gigabyte XML files
27//! - **Async I/O** with Tokio (via `async` feature flag)
28//! - **XSD schema validation** with comprehensive error messages
29//! - **Schema caching** for high-performance validation
30//! - Configurable output formatting (pretty print, attributes)
31//! - Support for nested structures and matrix lists
32//! - Reference and expression preservation
33//!
34//! # Security
35//!
36//! ## XML External Entity (XXE) Prevention
37//!
38//! The hedl-xml crate is **protected against XXE attacks by default** through multiple layers:
39//!
40//! ### Layer 1: Safe Parser (quick-xml)
41//!
42//! The underlying [quick-xml](https://crates.io/crates/quick-xml) library does not:
43//! - Resolve external entities (file://, http://, etc.)
44//! - Process DTD entity declarations
45//! - Expand entity references defined in DOCTYPEs
46//! - Support XInclude directives
47//!
48//! This makes XXE attacks **impossible** regardless of configuration.
49//!
50//! ### Layer 2: Entity Policy Controls
51//!
52//! For defense-in-depth and compliance requirements, explicit entity policies are available:
53//!
54//! ```rust
55//! use hedl_xml::{FromXmlConfig, EntityPolicy};
56//!
57//! // Strictest: Reject any XML with DOCTYPE declarations
58//! let strict_config = FromXmlConfig::strict_security();
59//!
60//! // Default: Allow DOCTYPE but never resolve entities
61//! let default_config = FromXmlConfig::default(); // AllowDtdNoExternal
62//!
63//! // Monitoring: Warn on DTD/entity detection
64//! let warn_config = FromXmlConfig {
65//!     entity_policy: EntityPolicy::WarnOnEntities,
66//!     log_security_events: true,
67//!     ..Default::default()
68//! };
69//! ```
70//!
71//! ### XXE Attack Vectors (Mitigated)
72//!
73//! The following XXE attack patterns are **prevented**:
74//!
75//! - **File Disclosure**: `<!ENTITY xxe SYSTEM "file:///etc/passwd">` - Not expanded
76//! - **Server-Side Request Forgery**: External HTTP entities are not resolved
77//! - **Billion Laughs DoS**: Entity definitions are ignored; no expansion occurs
78//! - **Out-of-Band Exfiltration**: Parameter entities are not resolved or executed
79//!
80//! # Examples
81//!
82//! ## Converting HEDL to XML
83//!
84//! ```rust
85//! use hedl_core::{Document, Item, Value};
86//! use hedl_xml::{to_xml, ToXmlConfig};
87//! use std::collections::BTreeMap;
88//!
89//! let mut doc = Document::new((1, 0));
90//! doc.root.insert("name".to_string(), Item::Scalar(Value::String("example".to_string().into())));
91//!
92//! let config = ToXmlConfig::default();
93//! let xml = to_xml(&doc, &config).unwrap();
94//! ```
95//!
96//! ## Converting XML to HEDL
97//!
98//! ```rust
99//! use hedl_xml::{from_xml, FromXmlConfig};
100//!
101//! let xml = r#"<?xml version="1.0"?><hedl><name>example</name></hedl>"#;
102//! let config = FromXmlConfig::default();
103//! let doc = from_xml(xml, &config).unwrap();
104//! ```
105//!
106//! ## Streaming large XML files
107//!
108//! For multi-gigabyte XML files, use the streaming API to process items incrementally
109//! without loading the entire document into memory:
110//!
111//! ```rust,no_run
112//! use hedl_xml::streaming::{from_xml_stream, StreamConfig};
113//! use std::fs::File;
114//!
115//! let file = File::open("large.xml")?;
116//! let config = StreamConfig::default();
117//!
118//! for result in from_xml_stream(file, &config)? {
119//!     match result {
120//!         Ok(item) => println!("Processing: {}", item.key),
121//!         Err(e) => eprintln!("Error: {}", e),
122//!     }
123//! }
124//! # Ok::<(), Box<dyn std::error::Error>>(())
125//! ```
126//!
127//! ## XSD Schema Validation
128//!
129//! Validate XML documents against XSD schemas:
130//!
131//! ```rust
132//! use hedl_xml::schema::SchemaValidator;
133//!
134//! let schema = r#"<?xml version="1.0"?>
135//! <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
136//!   <xs:element name="person">
137//!     <xs:complexType>
138//!       <xs:sequence>
139//!         <xs:element name="name" type="xs:string"/>
140//!         <xs:element name="age" type="xs:integer"/>
141//!       </xs:sequence>
142//!     </xs:complexType>
143//!   </xs:element>
144//! </xs:schema>"#;
145//!
146//! let validator = SchemaValidator::from_xsd(schema)?;
147//!
148//! let xml = r#"<?xml version="1.0"?>
149//! <person>
150//!   <name>Alice</name>
151//!   <age>30</age>
152//! </person>"#;
153//!
154//! validator.validate(xml)?;
155//! # Ok::<(), Box<dyn std::error::Error>>(())
156//! ```
157//!
158//! ## Async I/O (with `async` feature)
159//!
160//! Enable async support in `Cargo.toml`:
161//!
162//! ```toml
163//! [dependencies]
164//! hedl-xml = { version = "*", features = ["async"] }
165//! tokio = { version = "1", features = ["full"] }
166//! ```
167//!
168//! Then use async functions:
169//!
170//! ```rust,no_run
171//! # #[cfg(feature = "async")]
172//! # {
173//! use hedl_xml::async_api::{from_xml_file_async, to_xml_file_async};
174//! use hedl_xml::{FromXmlConfig, ToXmlConfig};
175//!
176//! # #[tokio::main]
177//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
178//! // Read XML asynchronously
179//! let doc = from_xml_file_async("input.xml", &FromXmlConfig::default()).await?;
180//!
181//! // Process document...
182//!
183//! // Write XML asynchronously
184//! to_xml_file_async(&doc, "output.xml", &ToXmlConfig::default()).await?;
185//! # Ok(())
186//! # }
187//! # }
188//! ```
189
190#![cfg_attr(not(test), warn(missing_docs))]
191mod from_xml;
192pub mod schema;
193pub mod security;
194pub mod streaming;
195mod to_xml;
196
197#[cfg(feature = "async")]
198pub mod async_api;
199
200pub use from_xml::{from_xml, EntityPolicy, FromXmlConfig};
201pub use schema::{SchemaCache, SchemaValidator, ValidationError};
202pub use security::{SecurityViolation, XmlSecurityValidator};
203pub use streaming::{from_xml_stream, StreamConfig, StreamItem, XmlStreamingParser};
204pub use to_xml::{to_xml, ToXmlConfig};
205
206use hedl_core::Document;
207
208/// Convert HEDL document to XML string with default configuration
209pub fn hedl_to_xml(doc: &Document) -> Result<String, String> {
210    to_xml(doc, &ToXmlConfig::default())
211}
212
213/// Convert XML string to HEDL document with default configuration
214pub fn xml_to_hedl(xml: &str) -> Result<Document, String> {
215    from_xml(xml, &FromXmlConfig::default())
216}
217
#[cfg(test)]
mod tests {
    use super::*;
    use hedl_core::{Document, Item, MatrixList, Node, Reference, Value};
    use std::collections::BTreeMap;

    /// Bool, int and string scalars must compare equal after a
    /// HEDL -> XML -> HEDL round trip.
    #[test]
    fn test_round_trip_scalars() {
        let mut doc = Document::new((1, 0));
        // NOTE: `null_val` and `float_val` are serialized and parsed along with
        // the rest, but their round-tripped values are not asserted below.
        doc.root
            .insert("null_val".to_string(), Item::Scalar(Value::Null));
        doc.root
            .insert("bool_val".to_string(), Item::Scalar(Value::Bool(true)));
        doc.root
            .insert("int_val".to_string(), Item::Scalar(Value::Int(42)));
        doc.root
            .insert("float_val".to_string(), Item::Scalar(Value::Float(3.25)));
        doc.root.insert(
            "string_val".to_string(),
            Item::Scalar(Value::String("hello".to_string().into())),
        );

        let xml = hedl_to_xml(&doc).unwrap();
        let doc2 = xml_to_hedl(&xml).unwrap();

        assert_eq!(
            doc2.root.get("bool_val").and_then(|i| i.as_scalar()),
            Some(&Value::Bool(true))
        );
        assert_eq!(
            doc2.root.get("int_val").and_then(|i| i.as_scalar()),
            Some(&Value::Int(42))
        );
        assert_eq!(
            doc2.root.get("string_val").and_then(|i| i.as_scalar()),
            Some(&Value::String("hello".to_string().into()))
        );
    }

    /// A one-level object keeps its scalar fields across a round trip.
    #[test]
    fn test_round_trip_object() {
        let mut doc = Document::new((1, 0));
        let mut inner = BTreeMap::new();
        inner.insert(
            "name".to_string(),
            Item::Scalar(Value::String("test".to_string().into())),
        );
        inner.insert("value".to_string(), Item::Scalar(Value::Int(100)));
        doc.root.insert("config".to_string(), Item::Object(inner));

        let xml = hedl_to_xml(&doc).unwrap();
        let doc2 = xml_to_hedl(&xml).unwrap();

        let config_obj = doc2.root.get("config").and_then(|i| i.as_object()).unwrap();
        assert_eq!(
            config_obj.get("name").and_then(|i| i.as_scalar()),
            Some(&Value::String("test".to_string().into()))
        );
        assert_eq!(
            config_obj.get("value").and_then(|i| i.as_scalar()),
            Some(&Value::Int(100))
        );
    }

    /// Both local and type-qualified references compare equal after a round trip.
    #[test]
    fn test_round_trip_reference() {
        let mut doc = Document::new((1, 0));
        doc.root.insert(
            "ref1".to_string(),
            Item::Scalar(Value::Reference(Reference::local("user123"))),
        );
        doc.root.insert(
            "ref2".to_string(),
            Item::Scalar(Value::Reference(Reference::qualified("User", "456"))),
        );

        let xml = hedl_to_xml(&doc).unwrap();
        let doc2 = xml_to_hedl(&xml).unwrap();

        assert_eq!(
            doc2.root.get("ref1").and_then(|i| i.as_scalar()),
            Some(&Value::Reference(Reference::local("user123")))
        );
        assert_eq!(
            doc2.root.get("ref2").and_then(|i| i.as_scalar()),
            Some(&Value::Reference(Reference::qualified("User", "456")))
        );
    }

    /// A call expression keeps its textual form across a round trip.
    #[test]
    fn test_round_trip_expression() {
        use hedl_core::lex::{ExprLiteral, Expression, Span};

        let mut doc = Document::new((1, 0));
        let expr = Expression::Call {
            name: "add".to_string(),
            args: vec![
                Expression::Identifier {
                    name: "x".to_string(),
                    span: Span::synthetic(),
                },
                Expression::Literal {
                    value: ExprLiteral::Int(1),
                    span: Span::synthetic(),
                },
            ],
            span: Span::synthetic(),
        };
        doc.root.insert(
            "expr".to_string(),
            Item::Scalar(Value::Expression(Box::new(expr.clone()))),
        );

        let xml = hedl_to_xml(&doc).unwrap();
        let doc2 = xml_to_hedl(&xml).unwrap();

        // Check expression is preserved (span info is lost during XML round-trip)
        if let Some(Item::Scalar(Value::Expression(e))) = doc2.root.get("expr") {
            // Compare string representation which ignores spans
            assert_eq!(e.to_string(), expr.to_string());
        } else {
            panic!("Expected expression value");
        }
    }

    /// A two-row matrix list serializes to XML that mentions the list key and
    /// both row ids. Only the XML text is inspected; nothing is parsed back.
    #[test]
    fn test_matrix_list() {
        let mut doc = Document::new((1, 0));
        let mut list = MatrixList::new("User", vec!["id".to_string(), "name".to_string()]);

        let node1 = Node::new(
            "User",
            "user1",
            vec![
                Value::String("user1".to_string().into()),
                Value::String("Alice".to_string().into()),
            ],
        );
        let node2 = Node::new(
            "User",
            "user2",
            vec![
                Value::String("user2".to_string().into()),
                Value::String("Bob".to_string().into()),
            ],
        );

        list.add_row(node1);
        list.add_row(node2);

        doc.root.insert("users".to_string(), Item::List(list));

        let xml = hedl_to_xml(&doc).unwrap();
        assert!(xml.contains("<users"));
        assert!(xml.contains("user1"));
        assert!(xml.contains("user2"));
    }

    /// Strings containing `&`, `<`, `>` and `"` come back identical after a
    /// round trip.
    #[test]
    fn test_special_characters_escaping() {
        let mut doc = Document::new((1, 0));
        doc.root.insert(
            "text".to_string(),
            Item::Scalar(Value::String(
                "hello & goodbye <tag> \"quoted\"".to_string().into(),
            )),
        );

        let xml = hedl_to_xml(&doc).unwrap();
        let doc2 = xml_to_hedl(&xml).unwrap();

        // XML escaping should be handled transparently
        let original = doc.root.get("text").and_then(|i| i.as_scalar());
        let parsed = doc2.root.get("text").and_then(|i| i.as_scalar());

        assert_eq!(original, parsed);
    }

    /// A two-level nested object round-trips without error and keeps its
    /// top-level key. The nested contents are not inspected.
    #[test]
    fn test_nested_objects() {
        let mut doc = Document::new((1, 0));

        let mut level2 = BTreeMap::new();
        level2.insert(
            "deep".to_string(),
            Item::Scalar(Value::String("value".to_string().into())),
        );

        let mut level1 = BTreeMap::new();
        level1.insert("nested".to_string(), Item::Object(level2));

        doc.root.insert("outer".to_string(), Item::Object(level1));

        let xml = hedl_to_xml(&doc).unwrap();
        let doc2 = xml_to_hedl(&xml).unwrap();

        assert!(doc2.root.contains_key("outer"));
    }

    /// Output produced with `pretty: true` is strictly longer than the compact
    /// output for the same document.
    #[test]
    fn test_config_pretty_print() {
        let mut doc = Document::new((1, 0));
        doc.root.insert(
            "test".to_string(),
            Item::Scalar(Value::String("value".to_string().into())),
        );

        let config_pretty = ToXmlConfig {
            pretty: true,
            indent: "  ".to_string(),
            ..Default::default()
        };

        let config_compact = ToXmlConfig {
            pretty: false,
            ..Default::default()
        };

        let xml_pretty = to_xml(&doc, &config_pretty).unwrap();
        let xml_compact = to_xml(&doc, &config_compact).unwrap();

        // Pretty printed should have newlines and indentation
        assert!(xml_pretty.len() > xml_compact.len());
    }

    /// `root_element` controls the name of the opening and closing root tags.
    #[test]
    fn test_config_custom_root() {
        let doc = Document::new((1, 0));

        let config = ToXmlConfig {
            root_element: "custom_root".to_string(),
            ..Default::default()
        };

        let xml = to_xml(&doc, &config).unwrap();
        assert!(xml.contains("<custom_root"));
        assert!(xml.contains("</custom_root>"));
    }

    /// With `include_metadata` enabled, the document version `(2, 1)` appears
    /// in the output as `version="2.1"`.
    #[test]
    fn test_config_metadata() {
        let doc = Document::new((2, 1));

        let config = ToXmlConfig {
            include_metadata: true,
            ..Default::default()
        };

        let xml = to_xml(&doc, &config).unwrap();
        assert!(xml.contains("version=\"2.1\""));
    }

    /// A `Null` scalar round-trips without error and its key is still present.
    /// The round-tripped value itself is not checked.
    #[test]
    fn test_empty_values() {
        let mut doc = Document::new((1, 0));
        doc.root
            .insert("empty".to_string(), Item::Scalar(Value::Null));

        let xml = hedl_to_xml(&doc).unwrap();
        let doc2 = xml_to_hedl(&xml).unwrap();

        assert!(doc2.root.contains_key("empty"));
    }

    /// A 1-D tensor serializes to a `<tensor>` element containing `<item>`
    /// children. Only the XML text is inspected.
    #[test]
    fn test_tensor_values() {
        use hedl_core::lex::Tensor;

        let mut doc = Document::new((1, 0));
        let tensor = Tensor::Array(vec![
            Tensor::Scalar(1.0),
            Tensor::Scalar(2.0),
            Tensor::Scalar(3.0),
        ]);
        doc.root.insert(
            "tensor".to_string(),
            Item::Scalar(Value::Tensor(Box::new(tensor))),
        );

        let xml = hedl_to_xml(&doc).unwrap();
        assert!(xml.contains("<tensor>"));
        assert!(xml.contains("<item>"));
    }

    /// With `infer_lists` enabled, repeated sibling elements are expected to be
    /// collected into a single list under the element name.
    #[test]
    fn test_infer_lists_config() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <hedl>
            <user id="1"><name>Alice</name></user>
            <user id="2"><name>Bob</name></user>
        </hedl>"#;

        let config = FromXmlConfig {
            infer_lists: true,
            ..Default::default()
        };

        let doc = from_xml(xml, &config).unwrap();

        // Should infer a list from repeated <user> elements
        assert!(doc.root.contains_key("user"));
        // Fail loudly on any other variant so the row-count check can never be
        // skipped silently.
        if let Some(Item::List(list)) = doc.root.get("user") {
            assert_eq!(list.rows.len(), 2);
        } else {
            panic!("Expected list value");
        }
    }

    /// XML attributes on an element become typed scalar fields of the
    /// resulting object.
    #[test]
    fn test_attributes_as_values() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <hedl>
            <item id="123" name="test" active="true"/>
        </hedl>"#;

        let config = FromXmlConfig::default();
        let doc = from_xml(xml, &config).unwrap();

        assert!(doc.root.contains_key("item"));
        // Fail loudly on any other variant so the attribute checks can never be
        // skipped silently.
        if let Some(Item::Object(obj)) = doc.root.get("item") {
            // "123" is inferred as an integer (type inference is correct)
            assert_eq!(
                obj.get("id").and_then(|i| i.as_scalar()),
                Some(&Value::Int(123))
            );
            assert_eq!(
                obj.get("name").and_then(|i| i.as_scalar()),
                Some(&Value::String("test".to_string().into()))
            );
            assert_eq!(
                obj.get("active").and_then(|i| i.as_scalar()),
                Some(&Value::Bool(true))
            );
        } else {
            panic!("Expected object value");
        }
    }
}