Skip to main content

hedl_xml/from_xml/
parser.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! XML parsing functions for HEDL conversion
19
20use super::config::{EntityPolicy, FromXmlConfig};
21use super::conversion::{
22    items_are_list_elements, items_are_tensor_elements, items_to_list,
23    items_to_matrix_list_with_type, items_to_tensor, to_hedl_key,
24};
25use super::values::{parse_value_with_config, parse_version};
26use hedl_core::convert::parse_reference;
27use hedl_core::lex::singularize_and_capitalize;
28use hedl_core::{Document, Item, Value};
29use quick_xml::events::Event;
30use quick_xml::Reader;
31use std::collections::BTreeMap;
32
33/// Maximum recursion depth for XML parsing (prevents stack overflow).
34const MAX_RECURSION_DEPTH: usize = 100;
35
36/// Convert XML string to HEDL Document
37pub fn from_xml(xml: &str, config: &FromXmlConfig) -> Result<Document, String> {
38    // Pre-scan for DOCTYPE declarations if strict policy
39    if config.entity_policy == EntityPolicy::RejectDtd
40        && (xml.contains("<!DOCTYPE") || xml.contains("<!ENTITY"))
41    {
42        return Err("DOCTYPE declarations rejected by entity policy (XXE prevention)".to_string());
43    }
44
45    let mut reader = Reader::from_str(xml);
46    // Note: trim_text disabled to preserve whitespace around entity references
47    // In quick-xml 0.38+, entities like &amp; are separate Event::GeneralRef events
48    reader.config_mut().trim_text(false);
49
50    let mut doc = Document::new(config.version);
51
52    // Skip XML declaration and find root element
53    loop {
54        match reader.read_event() {
55            Ok(Event::DocType(e)) => {
56                if config.log_security_events {
57                    eprintln!(
58                        "[SECURITY] DTD detected in XML input at position {}: {:?}",
59                        reader.buffer_position(),
60                        String::from_utf8_lossy(&e)
61                    );
62                }
63
64                match config.entity_policy {
65                    EntityPolicy::RejectDtd => {
66                        return Err(format!(
67                            "DOCTYPE declaration rejected at position {} (XXE prevention policy)",
68                            reader.buffer_position()
69                        ));
70                    }
71                    EntityPolicy::WarnOnEntities => {
72                        eprintln!(
73                            "[WARNING] DOCTYPE detected in XML. External entities are NOT processed by quick-xml."
74                        );
75                    }
76                    EntityPolicy::AllowDtdNoExternal => {
77                        // Continue parsing, entities won't be resolved anyway
78                    }
79                }
80            }
81            Ok(Event::Start(e)) | Ok(Event::Empty(e)) => {
82                let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
83
84                // Parse version from root if present
85                for attr in e.attributes().flatten() {
86                    let key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
87                    let value = String::from_utf8_lossy(&attr.value).to_string();
88                    if key == "version" {
89                        if let Some((major, minor)) = parse_version(&value) {
90                            doc.version = (major, minor);
91                        }
92                    }
93                }
94
95                // Parse root content
96                doc.root = parse_children(&mut reader, &name, config, &mut doc.structs, 0)?;
97                break;
98            }
99            Ok(Event::Eof) => break,
100            Err(e) => {
101                return Err(format!(
102                    "XML parse error at position {}: {}",
103                    reader.buffer_position(),
104                    e
105                ))
106            }
107            _ => {}
108        }
109    }
110
111    Ok(doc)
112}
113
114pub(crate) fn parse_children(
115    reader: &mut Reader<&[u8]>,
116    parent_name: &str,
117    config: &FromXmlConfig,
118    structs: &mut BTreeMap<String, Vec<String>>,
119    depth: usize,
120) -> Result<BTreeMap<String, Item>, String> {
121    // Security: Prevent stack overflow via deep recursion
122    if depth > MAX_RECURSION_DEPTH {
123        return Err(format!(
124            "XML recursion depth exceeded (max: {})",
125            MAX_RECURSION_DEPTH
126        ));
127    }
128    let mut children = BTreeMap::new();
129    // Track element items and explicit type attributes
130    let mut element_counts: BTreeMap<String, (Vec<Item>, Option<String>)> = BTreeMap::new();
131
132    loop {
133        match reader.read_event() {
134            Ok(Event::Start(e)) => {
135                let raw_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
136                let name = to_hedl_key(&raw_name);
137
138                // Extract explicit type attribute if it looks like HEDL metadata
139                // HEDL type names are PascalCase (e.g., type="Company" on <companies>)
140                let explicit_type = e.attributes().flatten().find_map(|attr| {
141                    let key = String::from_utf8_lossy(attr.key.as_ref());
142                    if key == "type" {
143                        let value = String::from_utf8_lossy(&attr.value).to_string();
144                        // Only treat as HEDL metadata if value looks like a type name
145                        // (starts with uppercase letter)
146                        if value
147                            .chars()
148                            .next()
149                            .map(|c| c.is_ascii_uppercase())
150                            .unwrap_or(false)
151                        {
152                            Some(value)
153                        } else {
154                            None
155                        }
156                    } else {
157                        None
158                    }
159                });
160
161                let elem_owned = e.to_owned();
162                let item = parse_element(reader, &elem_owned, config, structs, depth + 1)?;
163
164                // Track repeated elements for list inference
165                if config.infer_lists {
166                    let entry = element_counts
167                        .entry(name.clone())
168                        .or_insert((Vec::new(), None));
169                    entry.0.push(item);
170                    // Store explicit type if provided (first one wins)
171                    if entry.1.is_none() && explicit_type.is_some() {
172                        entry.1 = explicit_type;
173                    }
174                } else {
175                    // ISSUE 2 FIX: Detect duplicate elements when infer_lists is false
176                    if children.contains_key(&name) {
177                        return Err(format!(
178                            "Duplicate element '{}' found with infer_lists=false. \
179                             Enable infer_lists to automatically collect duplicates into a list.",
180                            name
181                        ));
182                    }
183                    children.insert(name, item);
184                }
185            }
186            Ok(Event::Empty(e)) => {
187                let raw_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
188                let name = to_hedl_key(&raw_name);
189
190                // Extract explicit type attribute if it looks like HEDL metadata
191                // HEDL type names are PascalCase (e.g., type="Company" on <companies>)
192                let explicit_type = e.attributes().flatten().find_map(|attr| {
193                    let key = String::from_utf8_lossy(attr.key.as_ref());
194                    if key == "type" {
195                        let value = String::from_utf8_lossy(&attr.value).to_string();
196                        // Only treat as HEDL metadata if value looks like a type name
197                        // (starts with uppercase letter)
198                        if value
199                            .chars()
200                            .next()
201                            .map(|c| c.is_ascii_uppercase())
202                            .unwrap_or(false)
203                        {
204                            Some(value)
205                        } else {
206                            None
207                        }
208                    } else {
209                        None
210                    }
211                });
212
213                let elem_owned = e.to_owned();
214                let item = parse_empty_element(&elem_owned, config)?;
215
216                if config.infer_lists {
217                    let entry = element_counts
218                        .entry(name.clone())
219                        .or_insert((Vec::new(), None));
220                    entry.0.push(item);
221                    if entry.1.is_none() && explicit_type.is_some() {
222                        entry.1 = explicit_type;
223                    }
224                } else {
225                    // ISSUE 2 FIX: Detect duplicate elements when infer_lists is false
226                    if children.contains_key(&name) {
227                        return Err(format!(
228                            "Duplicate element '{}' found with infer_lists=false. \
229                             Enable infer_lists to automatically collect duplicates into a list.",
230                            name
231                        ));
232                    }
233                    children.insert(name, item);
234                }
235            }
236            Ok(Event::End(e)) => {
237                let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
238                if name == parent_name {
239                    break;
240                }
241            }
242            Ok(Event::Eof) => break,
243            Err(e) => return Err(format!("XML parse error: {}", e)),
244            _ => {}
245        }
246    }
247
248    // Process element counts to infer lists
249    if config.infer_lists {
250        for (name, (items, explicit_type)) in element_counts {
251            if items.len() > 1 {
252                // Multiple elements - convert to matrix list
253                let list =
254                    items_to_matrix_list_with_type(&name, items, explicit_type, config, structs)?;
255                children.insert(name, Item::List(list));
256            } else if let Some(item) = items.into_iter().next() {
257                // Single item - but check if we have explicit type metadata
258                if let Some(ref type_name) = explicit_type {
259                    // Explicit type attribute means this should be a list
260                    // The item might be wrapped: <companies type="Company"><company>...</company></companies>
261                    // We need to extract the inner <company> element(s)
262                    if let Item::Object(inner) = &item {
263                        // Look for a child that matches the expected singular form
264                        let expected_child = type_name.to_lowercase();
265                        if let Some((_, child_item)) = inner
266                            .iter()
267                            .find(|(k, _)| k.to_lowercase() == expected_child)
268                        {
269                            match child_item {
270                                Item::List(list) => {
271                                    // Already a list - use it directly but update type name
272                                    let mut new_list = list.clone();
273                                    new_list.type_name = type_name.clone();
274                                    structs.insert(type_name.clone(), new_list.schema.clone());
275                                    children.insert(name, Item::List(new_list));
276                                    continue;
277                                }
278                                Item::Object(obj) => {
279                                    // Single object - wrap it in a list
280                                    let list = items_to_matrix_list_with_type(
281                                        &expected_child,
282                                        vec![Item::Object(obj.clone())],
283                                        Some(type_name.clone()),
284                                        config,
285                                        structs,
286                                    )?;
287                                    children.insert(name, Item::List(list));
288                                    continue;
289                                }
290                                _ => {}
291                            }
292                        }
293                    }
294                    // Fallback: wrap the single item as a list
295                    let list = items_to_matrix_list_with_type(
296                        &name,
297                        vec![item],
298                        explicit_type,
299                        config,
300                        structs,
301                    )?;
302                    children.insert(name, Item::List(list));
303                } else {
304                    children.insert(name, item);
305                }
306            }
307        }
308    }
309
310    Ok(children)
311}
312
313pub(crate) fn parse_element(
314    reader: &mut Reader<&[u8]>,
315    elem: &quick_xml::events::BytesStart<'_>,
316    config: &FromXmlConfig,
317    structs: &mut BTreeMap<String, Vec<String>>,
318    depth: usize,
319) -> Result<Item, String> {
320    // Security: Prevent stack overflow via deep recursion
321    if depth > MAX_RECURSION_DEPTH {
322        return Err(format!(
323            "XML recursion depth exceeded (max: {})",
324            MAX_RECURSION_DEPTH
325        ));
326    }
327    let name = String::from_utf8_lossy(elem.name().as_ref()).to_string();
328
329    // Extract attributes (convert keys to valid HEDL format)
330    let mut attributes = BTreeMap::new();
331    let mut is_reference = false;
332    for attr in elem.attributes().flatten() {
333        let raw_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
334        let value = String::from_utf8_lossy(&attr.value).to_string();
335
336        // Check for HEDL marker attributes
337        if raw_key == "__hedl_type__" {
338            if value == "ref" {
339                is_reference = true;
340            }
341            continue; // Don't include in regular attributes
342        }
343        if raw_key == "__hedl_child__" {
344            continue; // Skip child marker, it's handled separately
345        }
346        // Note: "type" attribute is NOT skipped here since it may be regular data.
347        // List type inference is handled separately in parse_children using explicit_type.
348
349        let key = to_hedl_key(&raw_key);
350        attributes.insert(key, value);
351    }
352
353    // Parse content
354    let mut text_content = String::new();
355    let mut child_elements: BTreeMap<String, Vec<Item>> = BTreeMap::new();
356    let mut marked_children: BTreeMap<String, Vec<Item>> = BTreeMap::new(); // Elements with __hedl_child__
357    let mut has_children = false;
358
359    loop {
360        match reader.read_event() {
361            Ok(Event::Start(e)) => {
362                has_children = true;
363                let raw_child_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
364                let child_name = to_hedl_key(&raw_child_name);
365
366                // Check for __hedl_child__ marker attribute
367                let is_marked_child = e.attributes().any(|attr| {
368                    if let Ok(attr) = attr {
369                        let key = String::from_utf8_lossy(attr.key.as_ref());
370                        let val = String::from_utf8_lossy(&attr.value);
371                        key == "__hedl_child__" && val == "true"
372                    } else {
373                        false
374                    }
375                });
376
377                let elem_owned = e.to_owned();
378                let child_item = parse_element(reader, &elem_owned, config, structs, depth + 1)?;
379
380                if is_marked_child {
381                    marked_children
382                        .entry(raw_child_name)
383                        .or_default()
384                        .push(child_item);
385                } else {
386                    child_elements
387                        .entry(child_name)
388                        .or_default()
389                        .push(child_item);
390                }
391            }
392            Ok(Event::Empty(e)) => {
393                has_children = true;
394                let raw_child_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
395                let child_name = to_hedl_key(&raw_child_name);
396
397                // Check for __hedl_child__ marker attribute
398                let is_marked_child = e.attributes().any(|attr| {
399                    if let Ok(attr) = attr {
400                        let key = String::from_utf8_lossy(attr.key.as_ref());
401                        let val = String::from_utf8_lossy(&attr.value);
402                        key == "__hedl_child__" && val == "true"
403                    } else {
404                        false
405                    }
406                });
407
408                let elem_owned = e.to_owned();
409                let child_item = parse_empty_element(&elem_owned, config)?;
410
411                if is_marked_child {
412                    marked_children
413                        .entry(raw_child_name)
414                        .or_default()
415                        .push(child_item);
416                } else {
417                    child_elements
418                        .entry(child_name)
419                        .or_default()
420                        .push(child_item);
421                }
422            }
423            Ok(Event::Text(e)) => {
424                let content = e
425                    .xml_content()
426                    .map_err(|e| format!("Text decode error: {}", e))?;
427                text_content.push_str(&content);
428            }
429            Ok(Event::GeneralRef(e)) => {
430                // Handle entity references (quick-xml 0.38+ reports these as separate events)
431                let ref_name = e.decode().map_err(|e| format!("Ref decode error: {}", e))?;
432                let unescaped = match ref_name.as_ref() {
433                    "amp" => "&",
434                    "lt" => "<",
435                    "gt" => ">",
436                    "quot" => "\"",
437                    "apos" => "'",
438                    _ => return Err(format!("Unknown entity reference: {}", ref_name)),
439                };
440                text_content.push_str(unescaped);
441            }
442            Ok(Event::End(e)) => {
443                let end_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
444                if end_name == name {
445                    break;
446                }
447            }
448            Ok(Event::Eof) => break,
449            Err(e) => return Err(format!("XML parse error: {}", e)),
450            _ => {}
451        }
452    }
453
454    // Determine item type
455    if has_children {
456        // Convert collected child elements, inferring lists for repeated elements
457        let mut result_children = BTreeMap::new();
458        for (child_name, items) in child_elements {
459            if items.len() > 1 {
460                if config.infer_lists {
461                    // Check element type and convert appropriately:
462                    // 1. Numeric scalars -> tensor (numeric arrays with [...] syntax)
463                    // 2. Non-numeric scalars -> list (non-numeric arrays with (...) syntax)
464                    // 3. Objects -> matrix list
465                    //
466                    // IMPORTANT: Check tensor first, because numeric values are also valid list elements,
467                    // but we want to preserve backward compatibility where numeric <item> elements
468                    // become tensors, not lists.
469                    if child_name == "item" && items_are_tensor_elements(&items) {
470                        // Convert to tensor (numeric arrays with [...] syntax)
471                        let tensor = items_to_tensor(&items)?;
472                        result_children
473                            .insert(child_name, Item::Scalar(Value::Tensor(Box::new(tensor))));
474                    } else if child_name == "item" && items_are_list_elements(&items) {
475                        // Convert to list (non-numeric arrays with (...) syntax)
476                        let list = items_to_list(&items)?;
477                        result_children
478                            .insert(child_name, Item::Scalar(Value::List(Box::new(list))));
479                    } else {
480                        // Multiple elements with same name - convert to matrix list
481                        use super::conversion::items_to_matrix_list;
482                        let list = items_to_matrix_list(&child_name, items, config, structs)?;
483                        result_children.insert(child_name, Item::List(list));
484                    }
485                } else {
486                    // ISSUE 2 FIX: Error when duplicates found with infer_lists=false
487                    return Err(format!(
488                        "Duplicate element '{}' found with infer_lists=false. \
489                         Enable infer_lists to automatically collect duplicates into a list.",
490                        child_name
491                    ));
492                }
493            } else if let Some(item) = items.into_iter().next() {
494                result_children.insert(child_name, item);
495            }
496        }
497
498        // Convert marked children (elements with __hedl_child__="true") to lists
499        // These represent NEST hierarchical children that should be attached to nodes
500        for (child_type_raw, child_items) in marked_children {
501            if !child_items.is_empty() {
502                // Convert to matrix list (even a single child becomes a list)
503                use super::conversion::items_to_matrix_list;
504                let list = items_to_matrix_list(&child_type_raw, child_items, config, structs)?;
505                let child_key = to_hedl_key(&child_type_raw);
506                result_children.insert(child_key, Item::List(list));
507            }
508        }
509
510        // ISSUE 1 FIX: Merge attributes into the result object
511        for (key, value_str) in attributes {
512            let value = parse_value_with_config(&value_str, config)?;
513            result_children.insert(key, Item::Scalar(value));
514        }
515
516        // Handle mixed content (text + children/attributes)
517        if !text_content.trim().is_empty() {
518            let value = if is_reference {
519                Value::Reference(parse_reference(text_content.trim())?)
520            } else {
521                parse_value_with_config(&text_content, config)?
522            };
523            result_children.insert("_text".to_string(), Item::Scalar(value));
524        }
525
526        // Check if we should flatten: if object has single child that's a list or Value::List,
527        // and the child name is the singular of the parent name, promote the list.
528        // This handles XML patterns like:
529        //   - <users><user>...</user><user>...</user></users> -> users:@User[...]
530        //   - <roles><item>admin</item><item>editor</item></roles> -> roles: (admin, editor)
531        // BUT: don't flatten if the list has hierarchical children (NEST structures)
532        // ALSO: don't flatten if we have attributes or text content
533        if result_children.len() == 1 {
534            // SAFETY: len() == 1 guarantees at least one element
535            let (child_key, child_item) = result_children.iter().next().expect("len == 1");
536
537            // Check for MatrixList (Item::List)
538            if let Item::List(list) = child_item {
539                // Don't flatten if any rows have children (hierarchical nesting)
540                let has_nested_children = list
541                    .rows
542                    .iter()
543                    .any(|node| node.children().map(|c| !c.is_empty()).unwrap_or(false));
544                if !has_nested_children {
545                    // Check if child is singular form of parent
546                    // Compare case-insensitively because XML element names may have different casing
547                    // e.g., post_tags -> PostTag, but child element might be posttag -> Posttag
548                    let parent_singular =
549                        singularize_and_capitalize(&to_hedl_key(&name)).to_lowercase();
550                    let child_type = singularize_and_capitalize(child_key).to_lowercase();
551                    if parent_singular == child_type {
552                        // Flatten: return the list directly
553                        // SAFETY: len() == 1 guarantees at least one element
554                        return Ok(result_children.into_values().next().expect("len == 1"));
555                    }
556                }
557            }
558
559            // Check for Value::List (Item::Scalar(Value::List))
560            // This handles non-numeric arrays like (admin, editor, viewer)
561            if let Item::Scalar(Value::List(_)) = child_item {
562                // For Value::List, always flatten if the key is "item"
563                // <roles><item>x</item><item>y</item></roles> -> roles: (x, y) not roles: { item: (x, y) }
564                if child_key == "item" {
565                    // SAFETY: len() == 1 guarantees at least one element
566                    return Ok(result_children.into_values().next().expect("len == 1"));
567                }
568            }
569        }
570
571        // Object with nested elements
572        Ok(Item::Object(result_children))
573    } else if !text_content.trim().is_empty() {
574        // Scalar with text content (and possibly attributes)
575        let value = if is_reference {
576            // Explicitly marked as reference
577            Value::Reference(parse_reference(text_content.trim())?)
578        } else {
579            parse_value_with_config(&text_content, config)?
580        };
581
582        // ISSUE 1 FIX: If we have both text and attributes, create an object
583        if !attributes.is_empty() {
584            let mut obj = BTreeMap::new();
585            obj.insert("_text".to_string(), Item::Scalar(value));
586            for (key, value_str) in attributes {
587                let attr_value = parse_value_with_config(&value_str, config)?;
588                obj.insert(key, Item::Scalar(attr_value));
589            }
590            Ok(Item::Object(obj))
591        } else {
592            Ok(Item::Scalar(value))
593        }
594    } else if !attributes.is_empty() {
595        // Empty element with attributes - convert to object
596        let mut obj = BTreeMap::new();
597        for (key, value_str) in attributes {
598            let value = parse_value_with_config(&value_str, config)?;
599            obj.insert(key, Item::Scalar(value));
600        }
601        Ok(Item::Object(obj))
602    } else {
603        // Empty element - null value
604        Ok(Item::Scalar(Value::Null))
605    }
606}
607
608pub(crate) fn parse_empty_element(
609    elem: &quick_xml::events::BytesStart<'_>,
610    config: &FromXmlConfig,
611) -> Result<Item, String> {
612    let mut attributes = BTreeMap::new();
613
614    for attr in elem.attributes().flatten() {
615        let raw_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
616        let key = to_hedl_key(&raw_key);
617        let value = String::from_utf8_lossy(&attr.value).to_string();
618        attributes.insert(key, value);
619    }
620
621    if attributes.is_empty() {
622        Ok(Item::Scalar(Value::Null))
623    } else if attributes.len() == 1 && attributes.contains_key("value") {
624        // Special case: <elem value="x"/> -> scalar x
625        // SAFETY: contains_key("value") guarantees get() succeeds
626        let value_str = attributes.get("value").expect("key exists");
627        let value = parse_value_with_config(value_str, config)?;
628        Ok(Item::Scalar(value))
629    } else {
630        // Multiple attributes - convert to object
631        let mut obj = BTreeMap::new();
632        for (key, value_str) in attributes {
633            let value = parse_value_with_config(&value_str, config)?;
634            obj.insert(key, Item::Scalar(value));
635        }
636        Ok(Item::Object(obj))
637    }
638}