hedl_xml/lib.rs
1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! HEDL XML Conversion
19//!
20//! Provides bidirectional conversion between HEDL documents and XML format.
21//!
22//! # Features
23//!
24//! - Convert HEDL documents to well-formed XML
25//! - Parse XML into HEDL documents with type inference
26//! - **Streaming support** for large multi-gigabyte XML files
27//! - **Async I/O** with Tokio (via `async` feature flag)
28//! - **XSD schema validation** with comprehensive error messages
29//! - **Schema caching** for high-performance validation
30//! - Configurable output formatting (pretty print, attributes)
31//! - Support for nested structures and matrix lists
32//! - Reference and expression preservation
33//!
34//! # Security
35//!
36//! ## XML External Entity (XXE) Prevention
37//!
38//! The hedl-xml crate is **protected against XXE attacks by default** through multiple layers:
39//!
40//! ### Layer 1: Safe Parser (quick-xml)
41//!
42//! The underlying [quick-xml](https://crates.io/crates/quick-xml) library does not:
43//! - Resolve external entities (file://, http://, etc.)
44//! - Process DTD entity declarations
45//! - Expand entity references defined in DOCTYPEs
46//! - Support XInclude directives
47//!
48//! This makes XXE attacks **impossible** regardless of configuration.
49//!
50//! ### Layer 2: Entity Policy Controls
51//!
52//! For defense-in-depth and compliance requirements, explicit entity policies are available:
53//!
54//! ```rust
55//! use hedl_xml::{FromXmlConfig, EntityPolicy};
56//!
57//! // Strictest: Reject any XML with DOCTYPE declarations
58//! let strict_config = FromXmlConfig::strict_security();
59//!
60//! // Default: Allow DOCTYPE but never resolve entities
61//! let default_config = FromXmlConfig::default(); // AllowDtdNoExternal
62//!
63//! // Monitoring: Warn on DTD/entity detection
64//! let warn_config = FromXmlConfig {
65//! entity_policy: EntityPolicy::WarnOnEntities,
66//! log_security_events: true,
67//! ..Default::default()
68//! };
69//! ```
70//!
71//! ### XXE Attack Vectors (Mitigated)
72//!
73//! The following XXE attack patterns are **prevented**:
74//!
75//! - **File Disclosure**: `<!ENTITY xxe SYSTEM "file:///etc/passwd">` - Not expanded
76//! - **Server-Side Request Forgery**: External HTTP entities are not resolved
77//! - **Billion Laughs DoS**: Entity definitions are ignored; no expansion occurs
78//! - **Out-of-Band Exfiltration**: Parameter entities are not resolved or executed
79//!
80//! # Examples
81//!
82//! ## Converting HEDL to XML
83//!
84//! ```rust
85//! use hedl_core::{Document, Item, Value};
86//! use hedl_xml::{to_xml, ToXmlConfig};
87//! use std::collections::BTreeMap;
88//!
89//! let mut doc = Document::new((2, 0));
90//! doc.root.insert("name".to_string(), Item::Scalar(Value::String("example".to_string().into())));
91//!
92//! let config = ToXmlConfig::default();
93//! let xml = to_xml(&doc, &config).unwrap();
94//! ```
95//!
96//! ## Converting XML to HEDL
97//!
98//! ```rust
99//! use hedl_xml::{from_xml, FromXmlConfig};
100//!
101//! let xml = r#"<?xml version="1.0"?><hedl><name>example</name></hedl>"#;
102//! let config = FromXmlConfig::default();
103//! let doc = from_xml(xml, &config).unwrap();
104//! ```
105//!
106//! ## Streaming large XML files
107//!
108//! For multi-gigabyte XML files, use the streaming API to process items incrementally
109//! without loading the entire document into memory:
110//!
111//! ```rust,no_run
112//! use hedl_xml::streaming::{from_xml_stream, StreamConfig};
113//! use std::fs::File;
114//!
115//! let file = File::open("large.xml")?;
116//! let config = StreamConfig::default();
117//!
118//! for result in from_xml_stream(file, &config)? {
119//! match result {
120//! Ok(item) => println!("Processing: {}", item.key),
121//! Err(e) => eprintln!("Error: {}", e),
122//! }
123//! }
124//! # Ok::<(), Box<dyn std::error::Error>>(())
125//! ```
126//!
127//! ## XSD Schema Validation
128//!
129//! Validate XML documents against XSD schemas:
130//!
131//! ```rust
132//! use hedl_xml::schema::SchemaValidator;
133//!
134//! let schema = r#"<?xml version="1.0"?>
135//! <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
136//! <xs:element name="person">
137//! <xs:complexType>
138//! <xs:sequence>
139//! <xs:element name="name" type="xs:string"/>
140//! <xs:element name="age" type="xs:integer"/>
141//! </xs:sequence>
142//! </xs:complexType>
143//! </xs:element>
144//! </xs:schema>"#;
145//!
146//! let validator = SchemaValidator::from_xsd(schema)?;
147//!
148//! let xml = r#"<?xml version="1.0"?>
149//! <person>
150//! <name>Alice</name>
151//! <age>30</age>
152//! </person>"#;
153//!
154//! validator.validate(xml)?;
155//! # Ok::<(), Box<dyn std::error::Error>>(())
156//! ```
157//!
158//! ## Async I/O (with `async` feature)
159//!
160//! Enable async support in `Cargo.toml`:
161//!
162//! ```toml
163//! [dependencies]
164//! hedl-xml = { version = "*", features = ["async"] }
165//! tokio = { version = "1", features = ["full"] }
166//! ```
167//!
168//! Then use async functions:
169//!
170//! ```rust,no_run
171//! # #[cfg(feature = "async")]
172//! # {
173//! use hedl_xml::async_api::{from_xml_file_async, to_xml_file_async};
174//! use hedl_xml::{FromXmlConfig, ToXmlConfig};
175//!
176//! # #[tokio::main]
177//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
178//! // Read XML asynchronously
179//! let doc = from_xml_file_async("input.xml", &FromXmlConfig::default()).await?;
180//!
181//! // Process document...
182//!
183//! // Write XML asynchronously
184//! to_xml_file_async(&doc, "output.xml", &ToXmlConfig::default()).await?;
185//! # Ok(())
186//! # }
187//! # }
188//! ```
189
190#![cfg_attr(not(test), warn(missing_docs))]
191mod from_xml;
192/// XML schema support.
193pub mod schema;
194/// XML security validation.
195pub mod security;
196/// Streaming XML parsing.
197pub mod streaming;
198mod to_xml;
199
200#[cfg(feature = "async")]
201/// Async XML API.
202pub mod async_api;
203
204pub use from_xml::{from_xml, EntityPolicy, FromXmlConfig};
205pub use schema::{SchemaCache, SchemaValidator, ValidationError};
206pub use security::{SecurityViolation, XmlSecurityValidator};
207pub use streaming::{from_xml_stream, StreamConfig, StreamItem, XmlStreamingParser};
208pub use to_xml::{to_xml, ToXmlConfig};
209
210use hedl_core::Document;
211
212/// Convert HEDL document to XML string with default configuration
213pub fn hedl_to_xml(doc: &Document) -> Result<String, String> {
214 to_xml(doc, &ToXmlConfig::default())
215}
216
217/// Convert XML string to HEDL document with default configuration
218pub fn xml_to_hedl(xml: &str) -> Result<Document, String> {
219 from_xml(xml, &FromXmlConfig::default())
220}
221
222#[cfg(test)]
223mod tests {
224 use super::*;
225 use hedl_core::{Document, Item, MatrixList, Node, Reference, Value};
226 use std::collections::BTreeMap;
227
228 #[test]
229 fn test_round_trip_scalars() {
230 let mut doc = Document::new((2, 0));
231 doc.root
232 .insert("null_val".to_string(), Item::Scalar(Value::Null));
233 doc.root
234 .insert("bool_val".to_string(), Item::Scalar(Value::Bool(true)));
235 doc.root
236 .insert("int_val".to_string(), Item::Scalar(Value::Int(42)));
237 doc.root
238 .insert("float_val".to_string(), Item::Scalar(Value::Float(3.25)));
239 doc.root.insert(
240 "string_val".to_string(),
241 Item::Scalar(Value::String("hello".to_string().into())),
242 );
243
244 let xml = hedl_to_xml(&doc).unwrap();
245 let doc2 = xml_to_hedl(&xml).unwrap();
246
247 assert_eq!(
248 doc2.root.get("bool_val").and_then(|i| i.as_scalar()),
249 Some(&Value::Bool(true))
250 );
251 assert_eq!(
252 doc2.root.get("int_val").and_then(|i| i.as_scalar()),
253 Some(&Value::Int(42))
254 );
255 assert_eq!(
256 doc2.root.get("string_val").and_then(|i| i.as_scalar()),
257 Some(&Value::String("hello".to_string().into()))
258 );
259 }
260
261 #[test]
262 fn test_round_trip_object() {
263 let mut doc = Document::new((2, 0));
264 let mut inner = BTreeMap::new();
265 inner.insert(
266 "name".to_string(),
267 Item::Scalar(Value::String("test".to_string().into())),
268 );
269 inner.insert("value".to_string(), Item::Scalar(Value::Int(100)));
270 doc.root.insert("config".to_string(), Item::Object(inner));
271
272 let xml = hedl_to_xml(&doc).unwrap();
273 let doc2 = xml_to_hedl(&xml).unwrap();
274
275 let config_obj = doc2.root.get("config").and_then(|i| i.as_object()).unwrap();
276 assert_eq!(
277 config_obj.get("name").and_then(|i| i.as_scalar()),
278 Some(&Value::String("test".to_string().into()))
279 );
280 assert_eq!(
281 config_obj.get("value").and_then(|i| i.as_scalar()),
282 Some(&Value::Int(100))
283 );
284 }
285
286 #[test]
287 fn test_round_trip_reference() {
288 let mut doc = Document::new((2, 0));
289 doc.root.insert(
290 "ref1".to_string(),
291 Item::Scalar(Value::Reference(Reference::local("user123"))),
292 );
293 doc.root.insert(
294 "ref2".to_string(),
295 Item::Scalar(Value::Reference(Reference::qualified("User", "456"))),
296 );
297
298 let xml = hedl_to_xml(&doc).unwrap();
299 let doc2 = xml_to_hedl(&xml).unwrap();
300
301 assert_eq!(
302 doc2.root.get("ref1").and_then(|i| i.as_scalar()),
303 Some(&Value::Reference(Reference::local("user123")))
304 );
305 assert_eq!(
306 doc2.root.get("ref2").and_then(|i| i.as_scalar()),
307 Some(&Value::Reference(Reference::qualified("User", "456")))
308 );
309 }
310
311 #[test]
312 fn test_round_trip_expression() {
313 use hedl_core::lex::{ExprLiteral, Expression, Span};
314
315 let mut doc = Document::new((2, 0));
316 let expr = Expression::Call {
317 name: "add".to_string(),
318 args: vec![
319 Expression::Identifier {
320 name: "x".to_string(),
321 span: Span::synthetic(),
322 },
323 Expression::Literal {
324 value: ExprLiteral::Int(1),
325 span: Span::synthetic(),
326 },
327 ],
328 span: Span::synthetic(),
329 };
330 doc.root.insert(
331 "expr".to_string(),
332 Item::Scalar(Value::Expression(Box::new(expr.clone()))),
333 );
334
335 let xml = hedl_to_xml(&doc).unwrap();
336 let doc2 = xml_to_hedl(&xml).unwrap();
337
338 // Check expression is preserved (span info is lost during XML round-trip)
339 if let Some(Item::Scalar(Value::Expression(e))) = doc2.root.get("expr") {
340 // Compare string representation which ignores spans
341 assert_eq!(e.to_string(), expr.to_string());
342 } else {
343 panic!("Expected expression value");
344 }
345 }
346
347 #[test]
348 fn test_matrix_list() {
349 let mut doc = Document::new((2, 0));
350 let mut list = MatrixList::new("User", vec!["id".to_string(), "name".to_string()]);
351
352 let node1 = Node::new(
353 "User",
354 "user1",
355 vec![
356 Value::String("user1".to_string().into()),
357 Value::String("Alice".to_string().into()),
358 ],
359 );
360 let node2 = Node::new(
361 "User",
362 "user2",
363 vec![
364 Value::String("user2".to_string().into()),
365 Value::String("Bob".to_string().into()),
366 ],
367 );
368
369 list.add_row(node1);
370 list.add_row(node2);
371
372 doc.root.insert("users".to_string(), Item::List(list));
373
374 let xml = hedl_to_xml(&doc).unwrap();
375 assert!(xml.contains("<users"));
376 assert!(xml.contains("user1"));
377 assert!(xml.contains("user2"));
378 }
379
380 #[test]
381 fn test_special_characters_escaping() {
382 let mut doc = Document::new((2, 0));
383 doc.root.insert(
384 "text".to_string(),
385 Item::Scalar(Value::String(
386 "hello & goodbye <tag> \"quoted\"".to_string().into(),
387 )),
388 );
389
390 let xml = hedl_to_xml(&doc).unwrap();
391 let doc2 = xml_to_hedl(&xml).unwrap();
392
393 // XML escaping should be handled transparently
394 let original = doc.root.get("text").and_then(|i| i.as_scalar());
395 let parsed = doc2.root.get("text").and_then(|i| i.as_scalar());
396
397 assert_eq!(original, parsed);
398 }
399
400 #[test]
401 fn test_nested_objects() {
402 let mut doc = Document::new((2, 0));
403
404 let mut level2 = BTreeMap::new();
405 level2.insert(
406 "deep".to_string(),
407 Item::Scalar(Value::String("value".to_string().into())),
408 );
409
410 let mut level1 = BTreeMap::new();
411 level1.insert("nested".to_string(), Item::Object(level2));
412
413 doc.root.insert("outer".to_string(), Item::Object(level1));
414
415 let xml = hedl_to_xml(&doc).unwrap();
416 let doc2 = xml_to_hedl(&xml).unwrap();
417
418 assert!(doc2.root.contains_key("outer"));
419 }
420
421 #[test]
422 fn test_config_pretty_print() {
423 let mut doc = Document::new((2, 0));
424 doc.root.insert(
425 "test".to_string(),
426 Item::Scalar(Value::String("value".to_string().into())),
427 );
428
429 let config_pretty = ToXmlConfig {
430 pretty: true,
431 indent: " ".to_string(),
432 ..Default::default()
433 };
434
435 let config_compact = ToXmlConfig {
436 pretty: false,
437 ..Default::default()
438 };
439
440 let xml_pretty = to_xml(&doc, &config_pretty).unwrap();
441 let xml_compact = to_xml(&doc, &config_compact).unwrap();
442
443 // Pretty printed should have newlines and indentation
444 assert!(xml_pretty.len() > xml_compact.len());
445 }
446
447 #[test]
448 fn test_config_custom_root() {
449 let doc = Document::new((2, 0));
450
451 let config = ToXmlConfig {
452 root_element: "custom_root".to_string(),
453 ..Default::default()
454 };
455
456 let xml = to_xml(&doc, &config).unwrap();
457 assert!(xml.contains("<custom_root"));
458 assert!(xml.contains("</custom_root>"));
459 }
460
461 #[test]
462 fn test_config_metadata() {
463 let doc = Document::new((2, 1));
464
465 let config = ToXmlConfig {
466 include_metadata: true,
467 ..Default::default()
468 };
469
470 let xml = to_xml(&doc, &config).unwrap();
471 assert!(xml.contains("version=\"2.1\""));
472 }
473
474 #[test]
475 fn test_empty_values() {
476 let mut doc = Document::new((2, 0));
477 doc.root
478 .insert("empty".to_string(), Item::Scalar(Value::Null));
479
480 let xml = hedl_to_xml(&doc).unwrap();
481 let doc2 = xml_to_hedl(&xml).unwrap();
482
483 assert!(doc2.root.contains_key("empty"));
484 }
485
486 #[test]
487 fn test_tensor_values() {
488 use hedl_core::lex::Tensor;
489
490 let mut doc = Document::new((2, 0));
491 let tensor = Tensor::Array(vec![
492 Tensor::Scalar(1.0),
493 Tensor::Scalar(2.0),
494 Tensor::Scalar(3.0),
495 ]);
496 doc.root.insert(
497 "tensor".to_string(),
498 Item::Scalar(Value::Tensor(Box::new(tensor))),
499 );
500
501 let xml = hedl_to_xml(&doc).unwrap();
502 assert!(xml.contains("<tensor>"));
503 assert!(xml.contains("<item>"));
504 }
505
506 #[test]
507 fn test_infer_lists_config() {
508 let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
509 <hedl>
510 <user id="1"><name>Alice</name></user>
511 <user id="2"><name>Bob</name></user>
512 </hedl>"#;
513
514 let config = FromXmlConfig {
515 infer_lists: true,
516 ..Default::default()
517 };
518
519 let doc = from_xml(xml, &config).unwrap();
520
521 // Should infer a list from repeated <user> elements
522 assert!(doc.root.contains_key("user"));
523 if let Some(Item::List(list)) = doc.root.get("user") {
524 assert_eq!(list.rows.len(), 2);
525 }
526 }
527
528 #[test]
529 fn test_attributes_as_values() {
530 let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
531 <hedl>
532 <item id="123" name="test" active="true"/>
533 </hedl>"#;
534
535 let config = FromXmlConfig::default();
536 let doc = from_xml(xml, &config).unwrap();
537
538 assert!(doc.root.contains_key("item"));
539 if let Some(Item::Object(obj)) = doc.root.get("item") {
540 // "123" is inferred as an integer (type inference is correct)
541 assert_eq!(
542 obj.get("id").and_then(|i| i.as_scalar()),
543 Some(&Value::Int(123))
544 );
545 assert_eq!(
546 obj.get("name").and_then(|i| i.as_scalar()),
547 Some(&Value::String("test".to_string().into()))
548 );
549 assert_eq!(
550 obj.get("active").and_then(|i| i.as_scalar()),
551 Some(&Value::Bool(true))
552 );
553 }
554 }
555}