Skip to main content

azul_layout/xml/
mod.rs

1//! XML/HTML parsing module for the Azul toolkit.
2//!
3//! Provides two parsing paths:
4//! - `parse_xml_string`: builds an `XmlNode` tree (used by `domxml_from_str`)
5//! - `parse_xml_to_fast_dom_with_css`: builds an arena-based `FastDom` directly
6//!   from XML tokens (used by `parse_xml_to_styled_dom`)
7//!
8//! Both paths handle HTML5-lite features: void elements, auto-closing tags,
9//! XML entity decoding, `<style>` CSS extraction, and BOM/DOCTYPE stripping.
10//!
11//! Data types (`XmlNode`, `XmlError`, etc.) live in `azul_core::xml`; this
12//! module provides the parsing implementations.
13
14#![allow(unused_variables)]
15
16use alloc::{boxed::Box, collections::BTreeMap, string::String, vec::Vec};
17use core::fmt;
18#[cfg(feature = "std")]
19use std::path::Path;
20
21#[cfg(feature = "svg")]
22pub mod svg;
23
24/// Decodes XML/HTML entities in a string.
25/// Handles standard XML entities: &lt; &gt; &amp; &apos; &quot;
26/// and numeric character references: &#60; &#x3C;
27/// Returns Cow::Borrowed when no entities are found (zero-alloc fast path).
28fn decode_xml_entities(s: &str) -> std::borrow::Cow<'_, str> {
29    // Fast path: if no ampersand, no entities to decode
30    if !s.contains('&') {
31        return std::borrow::Cow::Borrowed(s);
32    }
33    decode_xml_entities_slow(s)
34}
35
/// Allocating path of `decode_xml_entities`: always returns `Cow::Owned`.
///
/// The input is scanned for `&`. After each one, the longest run of
/// alphanumeric / `#` characters is taken as the entity name; scanning gives
/// up once the name exceeds 10 bytes. If the run is directly followed by `;`
/// and names a known entity (`lt`, `gt`, `amp`, `apos`, `quot`, `nbsp`) or a
/// valid numeric character reference (`#NN`, `#xHH`), the whole reference is
/// replaced by the decoded character. In every other case the original text
/// is copied through verbatim.
fn decode_xml_entities_slow(s: &str) -> std::borrow::Cow<'_, str> {
    // Names longer than this many bytes are never treated as entities.
    const MAX_NAME_BYTES: usize = 10;

    let mut out = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(amp_at) = rest.find('&') {
        // Copy everything in front of the ampersand as-is.
        out.push_str(&rest[..amp_at]);
        let after_amp = &rest[amp_at + 1..];

        // Measure the candidate entity name and note whether it is
        // terminated by a semicolon.
        let mut name_len = 0usize;
        let mut terminated = false;
        for ch in after_amp.chars() {
            if ch == ';' {
                terminated = true;
                break;
            }
            if !(ch.is_alphanumeric() || ch == '#') {
                break;
            }
            name_len += ch.len_utf8();
            if name_len > MAX_NAME_BYTES {
                // Too long to be an entity; stop without looking for ';'.
                break;
            }
        }
        let name = &after_amp[..name_len];

        if !terminated {
            // Not an entity reference: keep the text and resume right after it.
            out.push('&');
            out.push_str(name);
            rest = &after_amp[name_len..];
            continue;
        }

        let decoded = match name {
            "lt" => Some('<'),
            "gt" => Some('>'),
            "amp" => Some('&'),
            "apos" => Some('\''),
            "quot" => Some('"'),
            "nbsp" => Some('\u{00A0}'),
            _ => name.strip_prefix('#').and_then(|digits| {
                // Numeric character reference: `x`/`X` selects hexadecimal.
                let code_point = match digits.strip_prefix(&['x', 'X'][..]) {
                    Some(hex) => u32::from_str_radix(hex, 16).ok(),
                    None => digits.parse::<u32>().ok(),
                };
                code_point.and_then(char::from_u32)
            }),
        };

        match decoded {
            Some(ch) => out.push(ch),
            None => {
                // Unknown name, unparsable number or invalid code point:
                // reproduce the reference exactly as written.
                out.push('&');
                out.push_str(name);
                out.push(';');
            }
        }
        // Skip the name and its terminating ';'.
        rest = &after_amp[name_len + 1..];
    }

    out.push_str(rest);
    std::borrow::Cow::Owned(out)
}
116
117pub use azul_core::xml::*;
118use azul_core::{dom::Dom, impl_from, styled_dom::StyledDom, window::StringPairVec};
119#[cfg(feature = "parser")]
120use azul_css::parser2::CssParseError;
121use azul_css::{css::Css, AzString, OptionString, U8Vec};
122use xmlparser::Tokenizer;
123
/// Parses `xml` and builds a styled DOM from it, resolving custom tags
/// through `component_map`.
///
/// This function never fails: if either the XML parse (`parse_xml_string`)
/// or the DOM construction (`str_to_dom`) returns an error, the result is a
/// `<body>` whose only child is a text node containing the formatted error,
/// styled with an empty stylesheet.
#[cfg(feature = "xml")]
pub fn domxml_from_str(xml: &str, component_map: &ComponentMap) -> DomXml {
    // Renders `message` as the sole text child of an unstyled body.
    fn error_page(message: String) -> DomXml {
        let mut body = Dom::create_body().with_children(vec![Dom::create_text(message)].into());
        DomXml {
            parsed_dom: StyledDom::create(&mut body, Css::empty()),
        }
    }

    let nodes = match parse_xml_string(xml) {
        Ok(nodes) => nodes,
        Err(parse_err) => return error_page(format!("{}", parse_err)),
    };

    match str_to_dom(nodes.as_ref(), component_map, None) {
        Ok(parsed_dom) => DomXml { parsed_dom },
        Err(build_err) => error_page(format!("{}", build_err)),
    }
}
156
157/// Fastest path: parse XML string directly into FastDom without intermediate XmlNode tree.
158/// Feeds XML tokenizer events directly into CompactDomBuilder, skipping both the
159/// XmlNode tree construction AND the Dom tree construction.
160/// Parse XML string directly into a `FastDom` (arena-based DOM) in a single pass.
161///
162/// Also extracts `<style>` tag content as CSS. Returns both the FastDom and
163/// collected CSS stylesheets. No intermediate `XmlNode` tree is built.
164///
165/// This is the fastest XML→DOM path: XML tokens feed directly into
166/// `CompactDomBuilder`, and `<style>` text is collected inline.
167pub fn parse_xml_to_fast_dom(xml: &str) -> Result<azul_core::dom::FastDom, XmlError> {
168    let (fast_dom, _css) = parse_xml_to_fast_dom_with_css(xml)?;
169    Ok(fast_dom)
170}
171
172/// Parse XML directly into FastDom + extracted CSS, ready for StyledDom.
173pub fn parse_xml_to_styled_dom(xml: &str) -> Result<StyledDom, XmlError> {
174    // Optional per-phase RSS/timing breakdown.
175    // Gated on AZ_MEM_BREAKDOWN=1 — prints
176    //   [XML] tokenize+fast_dom       : +XX MiB in YY ms
177    //   [XML] css attach              : +XX MiB in YY ms
178    //   [XML] create_from_fast_dom    : +XX MiB in YY ms
179    // to locate which sub-phase of the parse-cascade dominates the
180    // RSS jump seen between `page start` and `xml parsed`.
181    static MEM_ENABLED: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
182    let mem_on = *MEM_ENABLED.get_or_init(azul_core::profile::memory_enabled);
183
184    let rss0 = if mem_on { peak_rss_bytes() } else { 0 };
185    let (mut fast_dom, css) = parse_xml_to_fast_dom_with_css(xml)?;
186    if mem_on {
187        let rss1 = peak_rss_bytes();
188        eprintln!(
189            "[XML] tokenize+fast_dom       : +{:.2} MiB",
190            (rss1.saturating_sub(rss0)) as f64 / 1024.0 / 1024.0,
191        );
192    }
193
194    let rss1 = if mem_on { peak_rss_bytes() } else { 0 };
195    // Attach CSS to the FastDom
196    if !css.is_empty() {
197        let combined_css = Css::new(css.into_iter()
198            .flat_map(|c| c.rules.into_library_owned_vec())
199            .collect());
200        fast_dom.css = vec![azul_core::dom::CssWithNodeId {
201            node_id: 0, // global scope
202            css: combined_css,
203        }].into();
204    }
205    if mem_on {
206        let rss2 = peak_rss_bytes();
207        eprintln!(
208            "[XML] css attach              : +{:.2} MiB",
209            (rss2.saturating_sub(rss1)) as f64 / 1024.0 / 1024.0,
210        );
211    }
212
213    // Hint the allocator to return pages freed by the CSS parser.
214    // The tokenizer+parser created many small allocations (selectors,
215    // declarations, strings) that are now packed into FastDom. Purging
216    // here returns those pages before the cascade allocates more.
217    crate::probe::hint_purge_allocator();
218
219    let rss2 = if mem_on { peak_rss_bytes() } else { 0 };
220    let styled = StyledDom::create_from_fast_dom(fast_dom);
221
222    // Major purge point: the cascade just freed ~3 MiB of intermediate
223    // allocations (build-phase Vecs, CSS selector matching state, pruned
224    // properties). Tell the allocator to return those pages NOW before
225    // the layout pass allocates more on top of them.
226    crate::probe::hint_purge_allocator();
227
228    if mem_on {
229        let rss3 = peak_rss_bytes();
230        eprintln!(
231            "[XML] create_from_fast_dom    : +{:.2} MiB",
232            (rss3.saturating_sub(rss2)) as f64 / 1024.0 / 1024.0,
233        );
234    }
235
236    Ok(styled)
237}
238
/// Peak resident-set size of the current process in bytes, used for the RSS
/// checkpoints in `parse_xml_to_styled_dom`.
///
/// On unix with the `probe` feature enabled the value comes from
/// `getrusage(RUSAGE_SELF)` (`ru_maxrss`); a failing call yields 0. In every
/// other configuration the function always returns 0, so callers simply
/// print zero deltas.
fn peak_rss_bytes() -> u64 {
    #[cfg(all(unix, feature = "probe"))]
    fn sample() -> u64 {
        // SAFETY: `libc::rusage` is a plain C struct of integer fields, for
        // which the all-zero bit pattern is a valid value.
        let mut usage: libc::rusage = unsafe { std::mem::zeroed() };
        // SAFETY: `usage` is a live, writable `rusage` for the whole call.
        let status = unsafe { libc::getrusage(libc::RUSAGE_SELF, &mut usage) };
        if status != 0 {
            return 0;
        }
        let max_rss = usage.ru_maxrss as u64;
        // macOS reports bytes, Linux reports KiB.
        if cfg!(target_os = "macos") {
            max_rss
        } else {
            max_rss.saturating_mul(1024)
        }
    }

    #[cfg(not(all(unix, feature = "probe")))]
    fn sample() -> u64 {
        0
    }

    sample()
}
261
262/// Internal: parse XML into FastDom + collected CSS stylesheets.
263fn parse_xml_to_fast_dom_with_css(xml: &str) -> Result<(azul_core::dom::FastDom, Vec<Css>), XmlError> {
264    use xmlparser::{ElementEnd::*, Token::*, Tokenizer};
265    use azul_core::dom::{NodeData, NodeType, IdOrClass, TabIndex};
266    use azul_core::xml::CompactDomBuilder;
267
268    // Strip BOM
269    let xml = xml.strip_prefix('\u{FEFF}').unwrap_or(xml);
270    let mut xml = xml.trim();
271
272    // Skip <?xml ... ?>
273    if xml.starts_with("<?") {
274        if let Some(pos) = xml.find("?>") {
275            xml = &xml[(pos + 2)..];
276        }
277    }
278
279    // Skip <!DOCTYPE ...>
280    let mut xml = xml.trim();
281    if xml.len() > 9 && xml[..9].to_ascii_lowercase().starts_with("<!doctype") {
282        if let Some(pos) = xml.find(">") {
283            xml = &xml[(pos + 1)..];
284        }
285    } else if xml.starts_with("<!--") {
286        if let Some(end) = xml.find("-->") {
287            xml = &xml[(end + 3)..];
288            xml = xml.trim();
289        }
290    }
291
292    let tokenizer = Tokenizer::from_fragment(xml, 0..xml.len());
293
294    const ESTIMATED_BYTES_PER_NODE: usize = 20;
295    let estimated_nodes = xml.len() / ESTIMATED_BYTES_PER_NODE;
296    let mut builder = CompactDomBuilder::with_capacity(estimated_nodes);
297    let mut collected_css: Vec<Css> = Vec::new();
298    let mut inside_style_tag = false;
299    let mut style_text = String::new();
300    // Track <head> depth: skip DOM nodes inside <head> (still collect <style> CSS).
301    // This ensures the FastDom contains only <html><body>... as the layout engine expects.
302    let mut head_depth: usize = 0;
303
304    // Temporary storage for current element's attributes
305    let mut current_tag: String = String::new();
306    let mut current_attrs: Vec<(String, String)> = Vec::new();
307    let mut pending_open = false;
308
309    const VOID_ELEMENTS: &[&str] = &[
310        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta",
311        "param", "source", "track", "wbr",
312    ];
313
314    // Pre-compute the CSS key map once (used for style= attribute parsing)
315    let css_key_map = azul_css::props::property::get_css_key_map();
316
317    // One bump arena for every AzString produced during this parse —
318    // id/class tokens, text nodes, etc. Replaces ~1k small heap allocs
319    // with a handful of 64 KiB chunks. Each AzString carries its own
320    // Arc reference to the arena, so the arena survives until the last
321    // string is dropped (typically when the StyledDom is dropped).
322    let mut str_arena = azul_css::corety::StringArena::new();
323
324    // Finalize the pending open element: create NodeData from tag + attrs, push to builder
325    // tag is already lowercase
326    let finalize_open = |
327        builder: &mut CompactDomBuilder,
328        str_arena: &mut azul_css::corety::StringArena,
329        tag: &str,
330        attrs: &[(String, String)],
331        css_key_map: &azul_css::props::property::CssKeyMap,
332    | {
333        let node_type = azul_core::xml::tag_to_node_type(tag);
334        let mut nd = NodeData::create_node(node_type);
335
336        // Apply attributes — build AttributeTypeVec directly (avoids the
337        // clone + retain dance in set_ids_and_classes for fresh NodeData).
338        let mut attr_vec: Vec<azul_core::dom::AttributeType> = Vec::new();
339        for (key, value) in attrs {
340            match key.as_str() {
341                "id" => {
342                    for id in value.split_whitespace() {
343                        attr_vec.push(azul_core::dom::AttributeType::Id(str_arena.intern(id)));
344                    }
345                }
346                "class" => {
347                    for class in value.split_whitespace() {
348                        attr_vec.push(azul_core::dom::AttributeType::Class(str_arena.intern(class)));
349                    }
350                }
351                "focusable" => {
352                    if let Some(f) = azul_core::xml::parse_bool(value.as_str()) {
353                        nd.set_tab_index(if f { TabIndex::Auto } else { TabIndex::NoKeyboardFocus });
354                    }
355                }
356                "tabindex" => {
357                    if let Ok(ti) = value.parse::<isize>() {
358                        match ti {
359                            0 => nd.set_tab_index(TabIndex::Auto),
360                            i if i > 0 => nd.set_tab_index(TabIndex::OverrideInParent(i as u32)),
361                            _ => nd.set_tab_index(TabIndex::NoKeyboardFocus),
362                        }
363                    }
364                }
365                "style" => {
366                    let mut css_attrs = Vec::new();
367                    for s in value.split(";") {
368                        let mut s = s.split(":");
369                        let key = match s.next() { Some(s) => s, None => continue };
370                        let val = match s.next() { Some(s) => s, None => continue };
371                        let _ = azul_css::parser2::parse_css_declaration(
372                            key.trim(), val.trim(),
373                            azul_css::parser2::ErrorLocationRange::default(),
374                            css_key_map, &mut Vec::new(), &mut css_attrs,
375                        );
376                    }
377                    let props = css_attrs.into_iter().filter_map(|s| {
378                        use azul_css::css::CssDeclaration;
379                        use azul_css::dynamic_selector::CssPropertyWithConditions;
380                        match s {
381                            CssDeclaration::Static(s) => Some(CssPropertyWithConditions::simple(s)),
382                            _ => None,
383                        }
384                    }).collect::<Vec<_>>();
385                    if !props.is_empty() {
386                        nd.set_css_props(props.into());
387                    }
388                }
389                "contenteditable" => {
390                    if azul_core::xml::parse_bool(value.as_str()).unwrap_or(false) {
391                        nd.set_contenteditable(true);
392                    }
393                }
394                _ => {}
395            }
396        }
397        if !attr_vec.is_empty() {
398            nd.set_attributes(attr_vec.into());
399        }
400
401        builder.open_node(nd);
402    };
403
404    let mut last_was_void = false;
405    let mut tag_stack: Vec<String> = Vec::new(); // for matching close tags
406
407    // Lowercase `src` into `dst`, reusing `dst`'s existing capacity.
408    // Zero-alloc when dst's capacity is already ≥ src.len() AND no uppercase
409    // conversion is needed (the happy path for HTML5 where tags are lowercase).
410    fn lowercase_into(dst: &mut String, src: &str) {
411        dst.clear();
412        if src.bytes().all(|b| !b.is_ascii_uppercase()) {
413            dst.push_str(src);
414        } else {
415            dst.reserve(src.len());
416            for b in src.bytes() {
417                dst.push(b.to_ascii_lowercase() as char);
418            }
419        }
420    }
421
422    for token in tokenizer {
423        let token = token.map_err(|e| XmlError::ParserError(translate_xmlparser_error(e)))?;
424        match token {
425            ElementStart { local, .. } => {
426                // Flush any pending open element
427                if pending_open {
428                    let is_void = VOID_ELEMENTS.contains(&current_tag.as_str());
429                    if current_tag == "head" { head_depth += 1; }
430                    if head_depth == 0 {
431                        finalize_open(&mut builder, &mut str_arena, &current_tag, &current_attrs, &css_key_map);
432                        if is_void { builder.close_node(); }
433                    }
434                    if !is_void {
435                        tag_stack.push(core::mem::take(&mut current_tag));
436                    }
437                }
438
439                // Reuse the current_tag buffer — avoids ~1023 fresh String
440                // allocations per parse (one per ElementStart).
441                lowercase_into(&mut current_tag, local.as_str());
442                current_attrs.clear();
443                pending_open = true;
444                last_was_void = VOID_ELEMENTS.contains(&current_tag.as_str());
445            }
446            Attribute { local, value, .. } => {
447                // decode_xml_entities returns Cow::Borrowed when no entities
448                // are present (the common case), so `.into_owned()` is the
449                // only fresh allocation here. The key is copied via
450                // `to_string()` because we can't hold a borrow across token
451                // iterations. TODO: when we switch current_attrs to
452                // Vec<(&str, Cow<str>)> this becomes zero-alloc for the key.
453                current_attrs.push((local.to_string(), decode_xml_entities(value.as_str()).into_owned()));
454            }
455            ElementEnd { end: Open, .. } => {
456                if pending_open {
457                    let is_void = VOID_ELEMENTS.contains(&current_tag.as_str());
458                    if current_tag == "style" {
459                        inside_style_tag = true;
460                        style_text.clear();
461                    }
462                    if current_tag == "head" { head_depth += 1; }
463                    if head_depth == 0 {
464                        finalize_open(&mut builder, &mut str_arena, &current_tag, &current_attrs, &css_key_map);
465                        if is_void { builder.close_node(); }
466                    }
467                    if !is_void {
468                        // Use take() instead of clone() — after pending_open=false,
469                        // current_tag is not read again until the next ElementStart
470                        // reassigns it via lowercase_into.
471                        tag_stack.push(core::mem::take(&mut current_tag));
472                    }
473                    pending_open = false;
474                }
475            }
476            ElementEnd { end: Empty, .. } => {
477                // Self-closing element: open + immediately close
478                if pending_open {
479                    if current_tag == "head" { head_depth += 1; }
480                    if head_depth == 0 {
481                        finalize_open(&mut builder, &mut str_arena, &current_tag, &current_attrs, &css_key_map);
482                        builder.close_node();
483                    }
484                    if current_tag == "head" && head_depth > 0 { head_depth -= 1; }
485                    pending_open = false;
486                }
487            }
488            ElementEnd { end: Close(_, close_value), .. } => {
489                if pending_open {
490                    let is_void = VOID_ELEMENTS.contains(&current_tag.as_str());
491                    if current_tag == "head" { head_depth += 1; }
492                    if head_depth == 0 {
493                        finalize_open(&mut builder, &mut str_arena, &current_tag, &current_attrs, &css_key_map);
494                        if is_void { builder.close_node(); }
495                    }
496                    if !is_void {
497                        tag_stack.push(core::mem::take(&mut current_tag));
498                    }
499                    pending_open = false;
500                }
501
502                let close_lower = close_value.as_str().to_ascii_lowercase();
503                let close_str = close_lower.as_str();
504                if VOID_ELEMENTS.contains(&close_str) {
505                    continue;
506                }
507
508                // If closing a <style> tag, parse collected CSS
509                if close_str == "style" && inside_style_tag {
510                    if !style_text.is_empty() {
511                        let parsed_css = Css::from_string(core::mem::take(&mut style_text).into());
512                        collected_css.push(parsed_css);
513                    }
514                    inside_style_tag = false;
515                }
516
517                // Pop until we find matching tag
518                while let Some(top) = tag_stack.last() {
519                    let is_match = top == close_str;
520                    let was_head = top == "head";
521                    // Pop this tag (unconditionally auto-close mismatched tags)
522                    let popped = tag_stack.pop().unwrap();
523                    if popped == "head" && head_depth > 0 { head_depth -= 1; }
524                    if head_depth == 0 && !was_head {
525                        builder.close_node();
526                    }
527                    if is_match { break; }
528                }
529            }
530            Text { text } => {
531                if pending_open {
532                    let is_void = VOID_ELEMENTS.contains(&current_tag.as_str());
533                    if current_tag == "style" {
534                        inside_style_tag = true;
535                        style_text.clear();
536                    }
537                    if current_tag == "head" { head_depth += 1; }
538                    if head_depth == 0 {
539                        finalize_open(&mut builder, &mut str_arena, &current_tag, &current_attrs, &css_key_map);
540                        if is_void { builder.close_node(); }
541                    }
542                    if !is_void {
543                        tag_stack.push(current_tag.clone());
544                    }
545                    pending_open = false;
546                }
547
548                let text_str = text.as_str();
549                if !text_str.is_empty() {
550                    if inside_style_tag {
551                        style_text.push_str(text_str);
552                    } else if head_depth == 0 {
553                        // Skip whitespace-only text at <html> level (between </head> and <body>)
554                        // but keep whitespace inside <body> (it's significant for inline layout)
555                        let inside_body = tag_stack.iter().any(|t| t == "body");
556                        if inside_body || !text_str.trim().is_empty() {
557                            let decoded = decode_xml_entities(text_str);
558                            builder.add_leaf(NodeData::create_text(str_arena.intern(&decoded)));
559                        }
560                    }
561                }
562            }
563            _ => {}
564        }
565    }
566
567    // Close any remaining open elements
568    if pending_open {
569        finalize_open(&mut builder, &mut str_arena, &current_tag, &current_attrs, &css_key_map);
570    }
571    while tag_stack.pop().is_some() {
572        builder.close_node();
573    }
574
575    // Drop the arena handle explicitly. AzStrings already embedded in
576    // the FastDom keep the backing bytes alive via their cloned Arc refs.
577    drop(str_arena);
578
579    Ok((builder.finish(), collected_css))
580}
581
/// Reads an XML file from disk and builds a DOM from it via `domxml_from_str`.
///
/// **Warning**: the file is re-read from disk on every call — do not use this
/// in release builds!
///
/// This function deliberately never fails: if the file cannot be read, the
/// returned DOM is a `<body>` containing a single text node with the message
/// `Error reading: "<path>": <io error>`, styled with an empty stylesheet.
/// Parse errors are rendered the same way by `domxml_from_str`.
#[cfg(all(feature = "std", feature = "xml"))]
pub fn domxml_from_file<I: AsRef<Path>>(
    file_path: I,
    component_map: &ComponentMap,
) -> DomXml {
    let path = file_path.as_ref();

    match std::fs::read_to_string(path) {
        Ok(contents) => domxml_from_str(&contents, component_map),
        Err(io_err) => {
            let message = format!(
                "Error reading: \"{}\": {}",
                path.to_string_lossy(),
                io_err
            );
            let mut error_body =
                Dom::create_body().with_children(vec![Dom::create_text(message)].into());
            DomXml {
                parsed_dom: StyledDom::create(&mut error_body, Css::empty()),
            }
        }
    }
}
618
619/// Parses the XML string into an XML tree, returns
620/// the root `<app></app>` node, with the children attached to it.
621///
622/// Since the XML allows multiple root nodes, this function returns
623/// a `Vec<XmlNode>` - which are the "root" nodes, containing all their
624/// children recursively.
625#[cfg(feature = "xml")]
626pub fn parse_xml_string(xml: &str) -> Result<Vec<XmlNodeChild>, XmlError> {
627    use xmlparser::{ElementEnd::*, Token::*, Tokenizer};
628
629    use self::XmlParseError::*;
630
631    let mut root_node = XmlNode::default();
632
633    // Strip UTF-8 BOM if present (some W3C test files have it)
634    let xml = xml.strip_prefix('\u{FEFF}').unwrap_or(xml);
635
636    // Search for "<?xml" and "?>" tags and delete them from the XML
637    let mut xml = xml.trim();
638    if xml.starts_with("<?") {
639        let pos = xml.find("?>").ok_or(XmlError::MalformedHierarchy(
640            azul_core::xml::MalformedHierarchyError {
641                expected: "<?xml".into(),
642                got: "?>".into(),
643            },
644        ))?;
645        xml = &xml[(pos + 2)..];
646    }
647
648    // Delete <!DOCTYPE ...> if necessary (case-insensitive)
649    let mut xml = xml.trim();
650    if xml.len() > 9 && xml[..9].to_ascii_lowercase().starts_with("<!doctype") {
651        let pos = xml.find(">").ok_or(XmlError::MalformedHierarchy(
652            azul_core::xml::MalformedHierarchyError {
653                expected: "<!DOCTYPE".into(),
654                got: ">".into(),
655            },
656        ))?;
657        xml = &xml[(pos + 1)..];
658    } else if xml.starts_with("<!--") {
659        // Skip HTML comments at the start
660        if let Some(end) = xml.find("-->") {
661            xml = &xml[(end + 3)..];
662            xml = xml.trim();
663        }
664    }
665
666    let tokenizer = Tokenizer::from_fragment(xml, 0..xml.len());
667
668    // OPTIMIZED: Use a stack of raw pointers to avoid O(n*d) traversal on every token.
669    // This is safe because:
670    // 1. All pointers point into `root_node` which is owned and not moved
671    // 2. We never hold multiple mutable references simultaneously
672    // 3. The stack is only used within this function
673    let mut node_stack: Vec<*mut XmlNode> = vec![&mut root_node as *mut XmlNode];
674
675    // HTML5-lite parser: List of void elements that should auto-close
676    // See: https://developer.mozilla.org/en-US/docs/Glossary/Void_element
677    const VOID_ELEMENTS: &[&str] = &[
678        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
679        "source", "track", "wbr",
680    ];
681
682    // HTML5-lite parser: Elements that auto-close when certain other elements are encountered
683    // Format: (element_name, closes_when_encountering)
684    const AUTO_CLOSE_RULES: &[(&str, &[&str])] = &[
685        // List items close when encountering another list item or when parent closes
686        ("li", &["li"]),
687        // Table cells/rows have complex closing rules
688        ("td", &["td", "th", "tr"]),
689        ("th", &["td", "th", "tr"]),
690        ("tr", &["tr"]),
691        // Paragraphs close on block-level elements
692        (
693            "p",
694            &[
695                "address",
696                "article",
697                "aside",
698                "blockquote",
699                "div",
700                "dl",
701                "fieldset",
702                "footer",
703                "form",
704                "h1",
705                "h2",
706                "h3",
707                "h4",
708                "h5",
709                "h6",
710                "header",
711                "hr",
712                "main",
713                "nav",
714                "ol",
715                "p",
716                "pre",
717                "section",
718                "table",
719                "ul",
720            ],
721        ),
722        // Option closes on another option or optgroup
723        ("option", &["option", "optgroup"]),
724        ("optgroup", &["optgroup"]),
725        // DD/DT close on each other
726        ("dd", &["dd", "dt"]),
727        ("dt", &["dd", "dt"]),
728    ];
729
730    // Track which hierarchy level is a void element (shouldn't be pushed to hierarchy)
731    let mut last_was_void = false;
732
733    for token in tokenizer {
734        let token = token.map_err(|e| XmlError::ParserError(translate_xmlparser_error(e)))?;
735        match token {
736            ElementStart { local, .. } => {
737                let tag_name = local.to_string();
738                let is_void_element = VOID_ELEMENTS.contains(&tag_name.as_str());
739
740                // HTML5-lite: If last element was a void element (like <img src="...">),
741                // pop it from hierarchy before processing the new element
742                if last_was_void {
743                    node_stack.pop();
744                    last_was_void = false;
745                }
746
747                // HTML5-lite: Check if we need to auto-close the current element
748                if node_stack.len() > 1 {
749                    // SAFETY: We only access the last element, which is valid
750                    let current_element = unsafe { &*node_stack[node_stack.len() - 1] };
751                    let current_tag = current_element.node_type.as_str();
752
753                    // Check if current element should auto-close when encountering this new tag
754                    for (element, closes_on) in AUTO_CLOSE_RULES {
755                        if current_tag == *element && closes_on.contains(&tag_name.as_str()) {
756                            // Auto-close the current element
757                            node_stack.pop();
758                            break;
759                        }
760                    }
761                }
762
763                // SAFETY: We access the last element which is valid
764                if let Some(&current_parent_ptr) = node_stack.last() {
765                    let current_parent = unsafe { &mut *current_parent_ptr };
766                    
767                    current_parent.children.push(XmlNodeChild::Element(XmlNode {
768                        node_type: tag_name.into(),
769                        attributes: StringPairVec::new().into(),
770                        children: Vec::new().into(),
771                    }));
772
773                    // Get pointer to the newly added child
774                    let children_len = current_parent.children.len();
775                    if let Some(XmlNodeChild::Element(ref mut new_child)) = current_parent.children.as_mut().get_mut(children_len - 1) {
776                        node_stack.push(new_child as *mut XmlNode);
777                    }
778                    
779                    last_was_void = is_void_element;
780                }
781            }
782            ElementEnd { end: Empty, .. } => {
783                // Pop hierarchy for all elements (including void elements after their attributes)
784                if node_stack.len() > 1 {
785                    node_stack.pop();
786                }
787                last_was_void = false;
788            }
789            ElementEnd {
790                end: Close(_, close_value),
791                ..
792            } => {
793                // HTML5-lite: If last element was a void element, pop it first
794                if last_was_void {
795                    node_stack.pop();
796                    last_was_void = false;
797                }
798
799                // HTML5-lite: Check if this is a void element - if so, ignore the closing tag
800                let is_void_element = VOID_ELEMENTS.contains(&close_value.as_str());
801                if is_void_element {
802                    // Void elements shouldn't have closing tags, but tolerate them
803                    continue;
804                }
805
806                // HTML5-lite: Auto-close any elements that should be closed
807                // Walk up the hierarchy and auto-close elements until we find a match
808                let close_value_str = close_value.as_str();
809
810                // Find matching element in stack (skip root at index 0)
811                let mut found_idx = None;
812                for i in (1..node_stack.len()).rev() {
813                    // SAFETY: All pointers in stack are valid
814                    let node = unsafe { &*node_stack[i] };
815                    if node.node_type.as_str() == close_value_str {
816                        found_idx = Some(i);
817                        break;
818                    }
819                }
820
821                if let Some(idx) = found_idx {
822                    // Pop all elements from current position to the matching element (inclusive)
823                    node_stack.truncate(idx);
824                }
825                // If no match found, just ignore (lenient HTML parsing)
826
827                last_was_void = false;
828            }
829            Attribute { local, value, .. } => {
830                // SAFETY: Last element in stack is valid
831                if let Some(&last_ptr) = node_stack.last() {
832                    let last = unsafe { &mut *last_ptr };
833                    // NOTE: Only lowercase the key ("local"), not the value!
834                    // Decode XML entities in attribute values as well
835                    last.attributes.push(azul_core::window::AzStringPair {
836                        key: local.to_string().into(),
837                        value: azul_css::AzString::from(&*decode_xml_entities(value.as_str())),
838                    });
839                }
840            }
841            Text { text } => {
842                // HTML5-lite: If last element was a void element, pop it before adding text
843                if last_was_void {
844                    node_stack.pop();
845                    last_was_void = false;
846                }
847
848                // IMPORTANT: Preserve ALL text nodes including whitespace-only nodes.
849                // Whether whitespace is significant depends on the CSS `white-space` property,
850                // which is determined during layout, not during parsing.
851                // 
852                // For example: <pre><span>    </span></pre> must preserve the 4 spaces.
853                // 
854                // We only skip completely EMPTY text nodes (zero-length strings).
855                let text_str = text.as_str();
856
857                if !text_str.is_empty() {
858                    // SAFETY: Last element in stack is valid
859                    if let Some(&current_parent_ptr) = node_stack.last() {
860                        let current_parent = unsafe { &mut *current_parent_ptr };
861                        // Decode XML entities (e.g., &lt; -> <, &gt; -> >, etc.)
862                        let decoded_text = decode_xml_entities(text_str);
863                        // Add text as a child node
864                        current_parent
865                            .children
866                            .push(XmlNodeChild::Text(azul_css::AzString::from(&*decoded_text)));
867                    }
868                }
869            }
870            _ => {}
871        }
872    }
873
874    // Clean up: if we ended with a void element, pop it
875    if last_was_void {
876        node_stack.pop();
877    }
878
879    Ok(root_node.children.into())
880}
881
/// Parses an XML/HTML string into an [`Xml`] document.
///
/// The input is handed to `parse_xml_string`; the nodes it returns become the
/// `root` of the resulting [`Xml`] value.
///
/// # Errors
///
/// Propagates any error produced by `parse_xml_string`.
#[cfg(feature = "xml")]
pub fn parse_xml(s: &str) -> Result<Xml, XmlError> {
    let top_level_nodes = parse_xml_string(s)?;
    let document = Xml {
        root: top_level_nodes.into(),
    };
    Ok(document)
}
888
889#[cfg(not(feature = "xml"))]
890pub fn parse_xml(s: &str) -> Result<Xml, XmlError> {
891    Err(XmlError::NoParserAvailable)
892}
893
894// to_string(&self) -> String
895
/// Converts a `roxmltree::ExpandedName` into the crate-local [`XmlQualifiedName`].
///
/// Both the local name and the namespace (when present) are copied into
/// owned strings, so the result does not borrow from the parsed document.
#[cfg(feature = "xml")]
pub fn translate_roxmltree_expandedname<'a, 'b>(
    e: roxmltree::ExpandedName<'a, 'b>,
) -> XmlQualifiedName {
    // Own the namespace URI first; `None` stays `None`.
    let namespace: Option<AzString> = e.namespace().map(str::to_string).map(Into::into);
    let local = e.name().to_string();

    XmlQualifiedName {
        local_name: local.into(),
        namespace: namespace.into(),
    }
}
906
/// Converts the name of a `roxmltree::Attribute` into an [`XmlQualifiedName`].
///
/// Only the attribute's name and namespace are read; its value is not used.
#[cfg(feature = "xml")]
fn translate_roxmltree_attribute(e: roxmltree::Attribute) -> XmlQualifiedName {
    let local = e.name().to_string();
    // Own the namespace URI, if the attribute has one.
    let namespace: Option<AzString> = match e.namespace() {
        Some(uri) => Some(uri.to_string().into()),
        None => None,
    };

    XmlQualifiedName {
        local_name: local.into(),
        namespace: namespace.into(),
    }
}
914
/// Converts an `xmlparser::StreamError` into the crate-local [`XmlStreamError`].
///
/// Payload-free variants map one-to-one. Variants that carry data have their
/// payload copied into the matching `*Error` struct, with the text position
/// converted through `translate_xmlparser_textpos`.
///
/// For `InvalidChar`, xmlparser documents the tuple as
/// `(actual byte, expected byte, position)`; the bytes are therefore stored
/// as `got` = actual and `expected` = expected.
#[cfg(feature = "xml")]
fn translate_xmlparser_streamerror(e: xmlparser::StreamError) -> XmlStreamError {
    match e {
        // Variants without a payload.
        xmlparser::StreamError::UnexpectedEndOfStream => XmlStreamError::UnexpectedEndOfStream,
        xmlparser::StreamError::InvalidName => XmlStreamError::InvalidName,
        xmlparser::StreamError::InvalidReference => XmlStreamError::InvalidReference,
        xmlparser::StreamError::InvalidExternalID => XmlStreamError::InvalidExternalID,
        xmlparser::StreamError::InvalidCommentData => XmlStreamError::InvalidCommentData,
        xmlparser::StreamError::InvalidCommentEnd => XmlStreamError::InvalidCommentEnd,
        xmlparser::StreamError::InvalidCharacterData => XmlStreamError::InvalidCharacterData,

        // Variants with a payload.
        xmlparser::StreamError::NonXmlChar(ch, tp) => XmlStreamError::NonXmlChar(NonXmlCharError {
            ch: ch.into(),
            pos: translate_xmlparser_textpos(tp),
        }),
        xmlparser::StreamError::InvalidChar(actual, expected, tp) => {
            // The first byte is what the tokenizer actually read, the second
            // is what it required; keep the field names truthful.
            XmlStreamError::InvalidChar(InvalidCharError {
                expected,
                got: actual,
                pos: translate_xmlparser_textpos(tp),
            })
        }
        xmlparser::StreamError::InvalidCharMultiple(a, b, tp) => {
            // NOTE(review): xmlparser documents this payload as
            // (actual byte, expected set), so the field names below read
            // inverted. The assignment is dictated by the field types of
            // `InvalidCharMultipleError` (single byte vs. byte vector) and
            // cannot be corrected without changing that struct.
            XmlStreamError::InvalidCharMultiple(InvalidCharMultipleError {
                expected: a,
                got: b.to_vec().into(),
                pos: translate_xmlparser_textpos(tp),
            })
        }
        xmlparser::StreamError::InvalidQuote(quote, tp) => {
            XmlStreamError::InvalidQuote(InvalidQuoteError {
                got: quote.into(),
                pos: translate_xmlparser_textpos(tp),
            })
        }
        xmlparser::StreamError::InvalidSpace(byte, tp) => {
            XmlStreamError::InvalidSpace(InvalidSpaceError {
                got: byte.into(),
                pos: translate_xmlparser_textpos(tp),
            })
        }
        xmlparser::StreamError::InvalidString(text, tp) => {
            XmlStreamError::InvalidString(InvalidStringError {
                got: text.to_string().into(),
                pos: translate_xmlparser_textpos(tp),
            })
        }
    }
}
963
/// Converts an `xmlparser::Error` into the crate-local [`XmlParseError`].
///
/// Every variant except `UnknownToken` carries a stream error plus a text
/// position; those are packed into an [`XmlTextError`]. `UnknownToken`
/// carries only a position.
#[cfg(feature = "xml")]
fn translate_xmlparser_error(e: xmlparser::Error) -> XmlParseError {
    /// Builds the shared `(stream error, position)` payload.
    fn located(cause: xmlparser::StreamError, at: xmlparser::TextPos) -> XmlTextError {
        XmlTextError {
            stream_error: translate_xmlparser_streamerror(cause),
            pos: translate_xmlparser_textpos(at),
        }
    }

    match e {
        xmlparser::Error::InvalidDeclaration(cause, at) => {
            XmlParseError::InvalidDeclaration(located(cause, at))
        }
        xmlparser::Error::InvalidComment(cause, at) => {
            XmlParseError::InvalidComment(located(cause, at))
        }
        xmlparser::Error::InvalidPI(cause, at) => XmlParseError::InvalidPI(located(cause, at)),
        xmlparser::Error::InvalidDoctype(cause, at) => {
            XmlParseError::InvalidDoctype(located(cause, at))
        }
        xmlparser::Error::InvalidEntity(cause, at) => {
            XmlParseError::InvalidEntity(located(cause, at))
        }
        xmlparser::Error::InvalidElement(cause, at) => {
            XmlParseError::InvalidElement(located(cause, at))
        }
        xmlparser::Error::InvalidAttribute(cause, at) => {
            XmlParseError::InvalidAttribute(located(cause, at))
        }
        xmlparser::Error::InvalidCdata(cause, at) => {
            XmlParseError::InvalidCdata(located(cause, at))
        }
        xmlparser::Error::InvalidCharData(cause, at) => {
            XmlParseError::InvalidCharData(located(cause, at))
        }
        xmlparser::Error::UnknownToken(at) => {
            XmlParseError::UnknownToken(translate_xmlparser_textpos(at))
        }
    }
}
1012
/// Converts a `roxmltree::Error` into the crate-local [`XmlError`].
///
/// Text positions are converted with `translate_roxml_textpos`. String
/// payloads are moved into the matching `*Error` struct. For the low-level
/// character/string variants (`NonXmlChar`, `InvalidChar`, `InvalidChar2`,
/// `InvalidString`) only the position is kept; the other payload fields are
/// discarded.
#[cfg(feature = "xml")]
pub fn translate_roxmltree_error(e: roxmltree::Error) -> XmlError {
    // Short local alias for the position converter used by almost every arm.
    let at = translate_roxml_textpos;

    match e {
        roxmltree::Error::InvalidXmlPrefixUri(p) => XmlError::InvalidXmlPrefixUri(at(p)),
        roxmltree::Error::UnexpectedXmlUri(p) => XmlError::UnexpectedXmlUri(at(p)),
        roxmltree::Error::UnexpectedXmlnsUri(p) => XmlError::UnexpectedXmlnsUri(at(p)),
        roxmltree::Error::InvalidElementNamePrefix(p) => {
            XmlError::InvalidElementNamePrefix(at(p))
        }
        roxmltree::Error::DuplicatedNamespace(name, p) => {
            XmlError::DuplicatedNamespace(DuplicatedNamespaceError {
                ns: name.into(),
                pos: at(p),
            })
        }
        roxmltree::Error::UnknownNamespace(name, p) => {
            XmlError::UnknownNamespace(UnknownNamespaceError {
                ns: name.into(),
                pos: at(p),
            })
        }
        roxmltree::Error::UnexpectedCloseTag(want, have, p) => {
            XmlError::UnexpectedCloseTag(UnexpectedCloseTagError {
                expected: want.into(),
                actual: have.into(),
                pos: at(p),
            })
        }
        roxmltree::Error::UnexpectedEntityCloseTag(p) => {
            XmlError::UnexpectedEntityCloseTag(at(p))
        }
        roxmltree::Error::UnknownEntityReference(name, p) => {
            XmlError::UnknownEntityReference(UnknownEntityReferenceError {
                entity: name.into(),
                pos: at(p),
            })
        }
        roxmltree::Error::MalformedEntityReference(p) => {
            XmlError::MalformedEntityReference(at(p))
        }
        roxmltree::Error::EntityReferenceLoop(p) => XmlError::EntityReferenceLoop(at(p)),
        roxmltree::Error::InvalidAttributeValue(p) => XmlError::InvalidAttributeValue(at(p)),
        roxmltree::Error::DuplicatedAttribute(name, p) => {
            XmlError::DuplicatedAttribute(DuplicatedAttributeError {
                attribute: name.into(),
                pos: at(p),
            })
        }
        roxmltree::Error::NoRootNode => XmlError::NoRootNode,
        roxmltree::Error::DtdDetected => XmlError::DtdDetected,
        roxmltree::Error::UnclosedRootNode => XmlError::UnclosedRootNode,
        roxmltree::Error::UnexpectedDeclaration(p) => XmlError::UnexpectedDeclaration(at(p)),
        roxmltree::Error::NodesLimitReached => XmlError::NodesLimitReached,
        roxmltree::Error::AttributesLimitReached => XmlError::AttributesLimitReached,
        roxmltree::Error::NamespacesLimitReached => XmlError::NamespacesLimitReached,
        roxmltree::Error::InvalidName(p) => XmlError::InvalidName(at(p)),
        // The offending bytes/strings are dropped below; only the position survives.
        roxmltree::Error::NonXmlChar(_, p) => XmlError::NonXmlChar(at(p)),
        roxmltree::Error::InvalidChar(_, _, p) => XmlError::InvalidChar(at(p)),
        roxmltree::Error::InvalidChar2(_, _, p) => XmlError::InvalidChar2(at(p)),
        roxmltree::Error::InvalidString(_, p) => XmlError::InvalidString(at(p)),
        roxmltree::Error::InvalidExternalID(p) => XmlError::InvalidExternalID(at(p)),
        roxmltree::Error::InvalidComment(p) => XmlError::InvalidComment(at(p)),
        roxmltree::Error::InvalidCharacterData(p) => XmlError::InvalidCharacterData(at(p)),
        roxmltree::Error::UnknownToken(p) => XmlError::UnknownToken(at(p)),
        roxmltree::Error::UnexpectedEndOfStream => XmlError::UnexpectedEndOfStream,
        roxmltree::Error::EntityResolver(p, message) => {
            // Variant added in roxmltree 0.21. There is no dedicated
            // `XmlError` counterpart, so for now it is reported as a generic
            // entity reference error.
            XmlError::UnknownEntityReference(UnknownEntityReferenceError {
                entity: message.into(),
                pos: at(p),
            })
        }
    }
}
1112
/// Copies the row and column of an `xmlparser::TextPos` into an [`XmlTextPos`].
#[cfg(feature = "xml")]
#[inline(always)]
const fn translate_xmlparser_textpos(o: xmlparser::TextPos) -> XmlTextPos {
    let row = o.row;
    let col = o.col;
    XmlTextPos { row, col }
}
1121
/// Copies the row and column of a `roxmltree::TextPos` into an [`XmlTextPos`].
#[cfg(feature = "xml")]
#[inline(always)]
const fn translate_roxml_textpos(o: roxmltree::TextPos) -> XmlTextPos {
    let row = o.row;
    let col = o.col;
    XmlTextPos { row, col }
}
1130
/// Adds XML/XHTML parsing to `Dom` by way of an extension trait.
///
/// The functionality lives in a trait, rather than as an inherent method, to
/// avoid a circular dependency between azul-core and azul-layout.
#[cfg(feature = "xml")]
pub trait DomXmlExt {
    /// Builds a [`StyledDom`] from an XML/XHTML string.
    ///
    /// The method is infallible by signature: instead of returning a
    /// `Result`, a parse failure is expected to produce a `StyledDom` that
    /// displays the error message.
    ///
    /// # Arguments
    /// * `xml` - The XML/XHTML source text; anything that can be viewed as `&str`
    ///
    /// # Returns
    /// The `StyledDom` for the parsed input, or an error DOM if parsing failed
    fn from_xml_string<S>(xml: S) -> StyledDom
    where
        S: AsRef<str>;
}
1150
#[cfg(feature = "xml")]
impl DomXmlExt for Dom {
    /// Parses `xml` using the built-in component map and returns the
    /// `parsed_dom` of the result produced by `domxml_from_str`.
    fn from_xml_string<S: AsRef<str>>(xml: S) -> StyledDom {
        let builtin_components = ComponentMap::with_builtin();
        domxml_from_str(xml.as_ref(), &builtin_components).parsed_dom
    }
}
1158}