// facet_html/parser.rs

1extern crate alloc;
2
3use alloc::borrow::Cow;
4use alloc::string::String;
5use alloc::vec::Vec;
6use core::fmt;
7
8use facet_format::{
9    ContainerKind, FieldEvidence, FieldKey, FieldLocationHint, FormatParser, ParseEvent,
10    ProbeStream, ScalarValue,
11};
12use html5gum::{Token, Tokenizer};
13
14/// HTML parser implementing the `FormatParser` trait.
15///
16/// This parser builds a tree of HTML elements from the tokenizer output,
17/// then emits ParseEvents from the tree structure.
18pub struct HtmlParser<'de> {
19    events: Vec<ParseEvent<'de>>,
20    idx: usize,
21    pending_error: Option<HtmlError>,
22}
23
24impl<'de> HtmlParser<'de> {
25    /// Create a new HTML parser from input bytes.
26    pub fn new(input: &'de [u8]) -> Self {
27        match build_events(input) {
28            Ok(events) => Self {
29                events,
30                idx: 0,
31                pending_error: None,
32            },
33            Err(err) => Self {
34                events: Vec::new(),
35                idx: 0,
36                pending_error: Some(err),
37            },
38        }
39    }
40}
41
/// Error type for HTML parsing.
#[derive(Debug, Clone)]
pub enum HtmlError {
    /// General parse error carrying a human-readable message.
    ParseError(String),
    /// The event stream ended while more input was still required.
    UnexpectedEof,
    /// The input bytes were not valid UTF-8.
    InvalidUtf8,
}

impl fmt::Display for HtmlError {
    /// Render the user-facing description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::ParseError(msg) => write!(f, "HTML parse error: {msg}"),
            // Fixed messages need no formatting machinery.
            Self::UnexpectedEof => f.write_str("Unexpected end of HTML"),
            Self::InvalidUtf8 => f.write_str("Invalid UTF-8 in HTML"),
        }
    }
}

impl std::error::Error for HtmlError {}
64
65impl<'de> FormatParser<'de> for HtmlParser<'de> {
66    type Error = HtmlError;
67    type Probe<'a>
68        = HtmlProbe<'de>
69    where
70        Self: 'a;
71
72    fn next_event(&mut self) -> Result<Option<ParseEvent<'de>>, Self::Error> {
73        if let Some(err) = &self.pending_error {
74            return Err(err.clone());
75        }
76        if self.idx >= self.events.len() {
77            return Ok(None);
78        }
79        let event = self.events[self.idx].clone();
80        self.idx += 1;
81        Ok(Some(event))
82    }
83
84    fn peek_event(&mut self) -> Result<Option<ParseEvent<'de>>, Self::Error> {
85        if let Some(err) = &self.pending_error {
86            return Err(err.clone());
87        }
88        Ok(self.events.get(self.idx).cloned())
89    }
90
91    fn skip_value(&mut self) -> Result<(), Self::Error> {
92        let mut struct_depth = 0usize;
93        let mut pending_field_value = false;
94
95        loop {
96            let event = self.next_event()?.ok_or(HtmlError::UnexpectedEof)?;
97            match event {
98                ParseEvent::StructStart(_) | ParseEvent::SequenceStart(_) => {
99                    pending_field_value = false;
100                    struct_depth += 1;
101                }
102                ParseEvent::StructEnd | ParseEvent::SequenceEnd => {
103                    if struct_depth == 0 {
104                        break;
105                    } else {
106                        struct_depth -= 1;
107                        if struct_depth == 0 && !pending_field_value {
108                            break;
109                        }
110                    }
111                }
112                ParseEvent::Scalar(_) | ParseEvent::VariantTag(_) => {
113                    if struct_depth == 0 && !pending_field_value {
114                        break;
115                    }
116                    pending_field_value = false;
117                }
118                ParseEvent::FieldKey(_) | ParseEvent::OrderedField => {
119                    pending_field_value = true;
120                }
121            }
122        }
123        Ok(())
124    }
125
126    fn begin_probe(&mut self) -> Result<Self::Probe<'_>, Self::Error> {
127        let evidence = self.build_probe();
128        Ok(HtmlProbe { evidence, idx: 0 })
129    }
130}
131
132impl<'de> HtmlParser<'de> {
133    /// Build field evidence by looking ahead at remaining events.
134    fn build_probe(&self) -> Vec<FieldEvidence<'de>> {
135        let mut evidence = Vec::new();
136
137        if self.idx >= self.events.len() {
138            return evidence;
139        }
140
141        if !matches!(
142            self.events.get(self.idx),
143            Some(ParseEvent::StructStart(ContainerKind::Element))
144        ) {
145            return evidence;
146        }
147
148        let mut i = self.idx + 1;
149        let mut depth = 0usize;
150
151        while i < self.events.len() {
152            match &self.events[i] {
153                ParseEvent::StructStart(_) | ParseEvent::SequenceStart(_) => {
154                    depth += 1;
155                    i += 1;
156                }
157                ParseEvent::StructEnd | ParseEvent::SequenceEnd => {
158                    if depth == 0 {
159                        break;
160                    }
161                    depth -= 1;
162                    i += 1;
163                }
164                ParseEvent::FieldKey(key) if depth == 0 => {
165                    let scalar_value = if let Some(next_event) = self.events.get(i + 1) {
166                        match next_event {
167                            ParseEvent::Scalar(sv) => Some(sv.clone()),
168                            _ => None,
169                        }
170                    } else {
171                        None
172                    };
173
174                    if let Some(sv) = scalar_value {
175                        evidence.push(FieldEvidence::with_scalar_value(
176                            key.name.clone(),
177                            key.location,
178                            None,
179                            sv,
180                            key.namespace.clone(),
181                        ));
182                    } else {
183                        evidence.push(FieldEvidence::new(
184                            key.name.clone(),
185                            key.location,
186                            None,
187                            key.namespace.clone(),
188                        ));
189                    }
190                    i += 1;
191                }
192                _ => {
193                    i += 1;
194                }
195            }
196        }
197
198        evidence
199    }
200}
201
202/// Probe stream for HTML evidence collection.
203pub struct HtmlProbe<'de> {
204    evidence: Vec<FieldEvidence<'de>>,
205    idx: usize,
206}
207
208impl<'de> ProbeStream<'de> for HtmlProbe<'de> {
209    type Error = HtmlError;
210
211    fn next(&mut self) -> Result<Option<FieldEvidence<'de>>, Self::Error> {
212        if self.idx >= self.evidence.len() {
213            Ok(None)
214        } else {
215            let ev = self.evidence[self.idx].clone();
216            self.idx += 1;
217            Ok(Some(ev))
218        }
219    }
220}
221
/// A child node in the DOM tree - either text or an element.
#[derive(Debug, Clone)]
enum ChildNode {
    /// A run of text.
    Text(String),
    /// A nested element.
    Element(Element),
}

/// An HTML element in the DOM tree.
#[derive(Debug, Clone)]
struct Element {
    /// Tag name, stored exactly as passed to `Element::new`.
    name: String,
    /// Attributes as (name, value) pairs.
    attributes: Vec<(String, String)>,
    /// Child nodes (text and elements interleaved, preserving order).
    children: Vec<ChildNode>,
}

impl Element {
    /// Create an element with the given name and attributes and no children.
    fn new(name: String, attributes: Vec<(String, String)>) -> Self {
        Self {
            name,
            attributes,
            children: Vec::new(),
        }
    }

    /// Append a run of text to this element.
    ///
    /// Leading and trailing whitespace is trimmed (interior whitespace is kept
    /// as is) and whitespace-only runs are dropped. A run that directly
    /// follows another text child is merged into it, separated by one space.
    fn push_text(&mut self, text: &str) {
        let text = text.trim();
        if text.is_empty() {
            return;
        }

        match self.children.last_mut() {
            Some(ChildNode::Text(tail)) => {
                tail.push(' ');
                tail.push_str(text);
            }
            _ => self.children.push(ChildNode::Text(String::from(text))),
        }
    }

    /// Append a nested element after the existing children.
    fn push_child(&mut self, child: Element) {
        self.children.push(ChildNode::Element(child));
    }
}
270
/// HTML void elements that cannot have children.
///
/// Entries are lowercase ASCII tag names.
const VOID_ELEMENTS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source",
    "track", "wbr",
];

/// Report whether `name` is a void element, ignoring ASCII case.
///
/// The comparison is done in place, so no lowercased copy of `name` is
/// allocated per lookup.
fn is_void_element(name: &str) -> bool {
    VOID_ELEMENTS
        .iter()
        .any(|void| void.eq_ignore_ascii_case(name))
}
280
281/// Build ParseEvents from HTML input.
282fn build_events<'de>(input: &'de [u8]) -> Result<Vec<ParseEvent<'de>>, HtmlError> {
283    let input_str = core::str::from_utf8(input).map_err(|_| HtmlError::InvalidUtf8)?;
284
285    let tokenizer = Tokenizer::new(input_str);
286    let mut stack: Vec<Element> = Vec::new();
287    let mut roots: Vec<Element> = Vec::new();
288    let mut doctype_name: Option<String> = None;
289
290    for token_result in tokenizer {
291        let token = token_result.map_err(|_| HtmlError::ParseError("tokenizer error".into()))?;
292
293        match token {
294            Token::StartTag(tag) => {
295                let name = String::from_utf8_lossy(&tag.name).to_ascii_lowercase();
296                let attributes: Vec<(String, String)> = tag
297                    .attributes
298                    .iter()
299                    .map(|(k, v)| {
300                        (
301                            String::from_utf8_lossy(k).into_owned(),
302                            String::from_utf8_lossy(v).into_owned(),
303                        )
304                    })
305                    .collect();
306
307                let elem = Element::new(name.clone(), attributes);
308
309                if tag.self_closing || is_void_element(&name) {
310                    // Self-closing or void element - attach immediately
311                    attach_element(&mut stack, elem, &mut roots);
312                } else {
313                    // Push onto stack to collect children
314                    stack.push(elem);
315                }
316            }
317            Token::EndTag(tag) => {
318                let end_name = String::from_utf8_lossy(&tag.name).to_ascii_lowercase();
319
320                // Pop elements until we find a matching start tag
321                // This handles malformed HTML gracefully
322                while let Some(elem) = stack.pop() {
323                    if elem.name == end_name {
324                        attach_element(&mut stack, elem, &mut roots);
325                        break;
326                    } else {
327                        // Implicitly close this element (HTML error recovery)
328                        attach_element(&mut stack, elem, &mut roots);
329                    }
330                }
331            }
332            Token::String(text) => {
333                let text_str = String::from_utf8_lossy(&text);
334                if let Some(current) = stack.last_mut() {
335                    current.push_text(&text_str);
336                }
337                // Text outside elements is ignored
338            }
339            Token::Doctype(doctype) => {
340                // Capture the DOCTYPE name (e.g., "html" for <!DOCTYPE html>)
341                let name = String::from_utf8_lossy(&doctype.name).to_ascii_lowercase();
342                if !name.is_empty() {
343                    doctype_name = Some(name);
344                }
345            }
346            Token::Comment(_) | Token::Error(_) => {
347                // Ignore comments and errors
348            }
349        }
350    }
351
352    // Close any remaining open elements
353    while let Some(elem) = stack.pop() {
354        attach_element(&mut stack, elem, &mut roots);
355    }
356
357    // If we have a doctype and the root is an html element, inject it as a pseudo-attribute
358    if let Some(ref doctype) = doctype_name
359        && roots.len() == 1
360        && roots[0].name == "html"
361    {
362        // Insert doctype as the first attribute
363        roots[0]
364            .attributes
365            .insert(0, ("doctype".to_string(), doctype.clone()));
366    }
367
368    // Generate events from the tree
369    let mut events = Vec::new();
370
371    if roots.is_empty() {
372        // Empty document
373        events.push(ParseEvent::StructStart(ContainerKind::Element));
374        events.push(ParseEvent::StructEnd);
375    } else if roots.len() == 1 {
376        // Single root element
377        emit_element_events(&roots[0], &mut events);
378    } else {
379        // Multiple roots - wrap in a virtual document element
380        events.push(ParseEvent::StructStart(ContainerKind::Element));
381        for root in &roots {
382            let key = FieldKey::new(Cow::Owned(root.name.clone()), FieldLocationHint::Child);
383            events.push(ParseEvent::FieldKey(key));
384            emit_element_events(root, &mut events);
385        }
386        events.push(ParseEvent::StructEnd);
387    }
388
389    Ok(events)
390}
391
392/// Attach an element to its parent or to the roots list.
393fn attach_element(stack: &mut [Element], elem: Element, roots: &mut Vec<Element>) {
394    if let Some(parent) = stack.last_mut() {
395        parent.push_child(elem);
396    } else {
397        roots.push(elem);
398    }
399}
400
401/// Emit ParseEvents from an Element.
402fn emit_element_events<'de>(elem: &Element, events: &mut Vec<ParseEvent<'de>>) {
403    let has_attrs = !elem.attributes.is_empty();
404    let has_children = !elem.children.is_empty();
405
406    // Case 1: No attributes, no children - emit struct with just _tag
407    if !has_attrs && !has_children {
408        events.push(ParseEvent::StructStart(ContainerKind::Element));
409        // Always emit _tag so custom elements can capture the tag name
410        let key = FieldKey::new(Cow::Borrowed("_tag"), FieldLocationHint::Tag);
411        events.push(ParseEvent::FieldKey(key));
412        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
413            elem.name.clone(),
414        ))));
415        events.push(ParseEvent::StructEnd);
416        return;
417    }
418
419    // Case 2: Has attributes or children - emit as struct with _text children
420    // The deserializer handles grouping repeated field names into sequences.
421    events.push(ParseEvent::StructStart(ContainerKind::Element));
422
423    // Always emit _tag first so custom elements can capture the tag name
424    let key = FieldKey::new(Cow::Borrowed("_tag"), FieldLocationHint::Tag);
425    events.push(ParseEvent::FieldKey(key));
426    events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
427        elem.name.clone(),
428    ))));
429
430    // Emit attributes as fields
431    for (name, value) in &elem.attributes {
432        let key = FieldKey::new(Cow::Owned(name.clone()), FieldLocationHint::Attribute);
433        events.push(ParseEvent::FieldKey(key));
434        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
435            value.clone(),
436        ))));
437    }
438
439    // Emit children in order (preserving interleaved text/element ordering)
440    for child in &elem.children {
441        match child {
442            ChildNode::Text(text) => {
443                let key = FieldKey::new(Cow::Borrowed("_text"), FieldLocationHint::Text);
444                events.push(ParseEvent::FieldKey(key));
445                events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
446                    text.clone(),
447                ))));
448            }
449            ChildNode::Element(child_elem) => {
450                let key = FieldKey::new(
451                    Cow::Owned(child_elem.name.clone()),
452                    FieldLocationHint::Child,
453                );
454                events.push(ParseEvent::FieldKey(key));
455                emit_element_events(child_elem, events);
456            }
457        }
458    }
459
460    events.push(ParseEvent::StructEnd);
461}
462
463#[cfg(test)]
464mod tests {
465    use super::*;
466    use facet::Facet;
467    use facet_format::FormatDeserializer;
468
469    #[test]
470    fn test_simple_element() {
471        let html = b"<div>hello</div>";
472        let events = build_events(html).unwrap();
473        // Elements now emit _tag first, then _text for content
474        assert_eq!(
475            events,
476            vec![
477                ParseEvent::StructStart(ContainerKind::Element),
478                ParseEvent::FieldKey(FieldKey::new(Cow::Borrowed("_tag"), FieldLocationHint::Tag)),
479                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("div".into()))),
480                ParseEvent::FieldKey(FieldKey::new(
481                    Cow::Borrowed("_text"),
482                    FieldLocationHint::Text
483                )),
484                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("hello".into()))),
485                ParseEvent::StructEnd,
486            ]
487        );
488    }
489
490    #[test]
491    fn test_element_with_attribute() {
492        let html = b"<div class=\"foo\">hello</div>";
493        let events = build_events(html).unwrap();
494        assert_eq!(
495            events,
496            vec![
497                ParseEvent::StructStart(ContainerKind::Element),
498                ParseEvent::FieldKey(FieldKey::new(Cow::Borrowed("_tag"), FieldLocationHint::Tag)),
499                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("div".into()))),
500                ParseEvent::FieldKey(FieldKey::new(
501                    Cow::Owned("class".into()),
502                    FieldLocationHint::Attribute
503                )),
504                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("foo".into()))),
505                ParseEvent::FieldKey(FieldKey::new(
506                    Cow::Owned("_text".into()),
507                    FieldLocationHint::Text
508                )),
509                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("hello".into()))),
510                ParseEvent::StructEnd,
511            ]
512        );
513    }
514
515    #[test]
516    fn test_nested_elements() {
517        let html = b"<div><span>inner</span></div>";
518        let events = build_events(html).unwrap();
519        // Nested elements now emit _tag, then child elements with their _tag
520        assert_eq!(
521            events,
522            vec![
523                ParseEvent::StructStart(ContainerKind::Element),
524                ParseEvent::FieldKey(FieldKey::new(Cow::Borrowed("_tag"), FieldLocationHint::Tag)),
525                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("div".into()))),
526                ParseEvent::FieldKey(FieldKey::new(
527                    Cow::Owned("span".into()),
528                    FieldLocationHint::Child
529                )),
530                ParseEvent::StructStart(ContainerKind::Element),
531                ParseEvent::FieldKey(FieldKey::new(Cow::Borrowed("_tag"), FieldLocationHint::Tag)),
532                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("span".into()))),
533                ParseEvent::FieldKey(FieldKey::new(
534                    Cow::Borrowed("_text"),
535                    FieldLocationHint::Text
536                )),
537                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("inner".into()))),
538                ParseEvent::StructEnd,
539                ParseEvent::StructEnd,
540            ]
541        );
542    }
543
544    #[test]
545    fn test_void_element() {
546        let html = b"<div><br><span>after</span></div>";
547        let events = build_events(html).unwrap();
548        // br is a void element, should be parsed correctly
549        assert!(!events.is_empty());
550    }
551
552    #[test]
553    fn test_deserialize_simple_struct() {
554        #[derive(Debug, Facet, PartialEq)]
555        struct Div {
556            #[facet(default)]
557            class: Option<String>,
558        }
559
560        let html = b"<div class=\"container\"></div>";
561        let parser = HtmlParser::new(html);
562        let mut deserializer = FormatDeserializer::new(parser);
563        let result: Div = deserializer.deserialize().unwrap();
564        assert_eq!(result.class, Some("container".into()));
565    }
566
567    #[test]
568    fn test_deserialize_nested() {
569        use facet_xml as xml;
570
571        #[derive(Debug, Facet, PartialEq)]
572        struct Outer {
573            #[facet(default)]
574            inner: Option<Inner>,
575        }
576
577        #[derive(Debug, Facet, PartialEq)]
578        struct Inner {
579            #[facet(default)]
580            value: Option<Value>,
581        }
582
583        #[derive(Debug, Facet, PartialEq)]
584        struct Value {
585            #[facet(xml::text, default)]
586            text: String,
587        }
588
589        let html = b"<outer><inner><value>hello</value></inner></outer>";
590        let parser = HtmlParser::new(html);
591        let mut deserializer = FormatDeserializer::new(parser);
592        let result: Outer = deserializer.deserialize().unwrap();
593        assert_eq!(
594            result,
595            Outer {
596                inner: Some(Inner {
597                    value: Some(Value {
598                        text: "hello".into()
599                    })
600                })
601            }
602        );
603    }
604
605    #[test]
606    fn test_deserialize_with_text() {
607        use facet_xml as xml;
608
609        #[derive(Debug, Facet, PartialEq)]
610        struct Article {
611            #[facet(default)]
612            title: Option<TitleElement>,
613            #[facet(default)]
614            content: Option<ContentElement>,
615        }
616
617        #[derive(Debug, Facet, PartialEq)]
618        struct TitleElement {
619            #[facet(xml::text, default)]
620            text: String,
621        }
622
623        #[derive(Debug, Facet, PartialEq)]
624        struct ContentElement {
625            #[facet(xml::text, default)]
626            text: String,
627        }
628
629        let html = b"<article><title>Hello</title><content>World</content></article>";
630        let parser = HtmlParser::new(html);
631        let mut deserializer = FormatDeserializer::new(parser);
632        let result: Article = deserializer.deserialize().unwrap();
633        assert_eq!(
634            result,
635            Article {
636                title: Some(TitleElement {
637                    text: "Hello".into()
638                }),
639                content: Some(ContentElement {
640                    text: "World".into()
641                })
642            }
643        );
644    }
645
646    #[test]
647    fn test_deserialize_multiple_attributes() {
648        #[derive(Debug, Facet, PartialEq)]
649        struct Link {
650            #[facet(default)]
651            href: Option<String>,
652            #[facet(default)]
653            target: Option<String>,
654            #[facet(default)]
655            rel: Option<String>,
656        }
657
658        let html = b"<a href=\"https://example.com\" target=\"_blank\" rel=\"noopener\"></a>";
659        let parser = HtmlParser::new(html);
660        let mut deserializer = FormatDeserializer::new(parser);
661        let result: Link = deserializer.deserialize().unwrap();
662        assert_eq!(
663            result,
664            Link {
665                href: Some("https://example.com".into()),
666                target: Some("_blank".into()),
667                rel: Some("noopener".into())
668            }
669        );
670    }
671
672    #[test]
673    fn test_deserialize_predefined_img() {
674        use facet_html_dom::Img;
675
676        let html = b"<img src=\"photo.jpg\" alt=\"A photo\" width=\"100\" height=\"200\">";
677        let parser = HtmlParser::new(html);
678        let mut deserializer = FormatDeserializer::new(parser);
679        let result: Img = deserializer.deserialize().unwrap();
680        assert_eq!(result.src, Some("photo.jpg".into()));
681        assert_eq!(result.alt, Some("A photo".into()));
682        assert_eq!(result.width, Some("100".into()));
683        assert_eq!(result.height, Some("200".into()));
684    }
685
686    #[test]
687    fn test_deserialize_predefined_a() {
688        use facet_html_dom::{A, PhrasingContent};
689
690        let html = b"<a href=\"https://example.com\" target=\"_blank\">Click me</a>";
691        let parser = HtmlParser::new(html);
692        let mut deserializer = FormatDeserializer::new(parser);
693        let result: A = deserializer.deserialize().unwrap();
694        assert_eq!(result.href, Some("https://example.com".into()));
695        assert_eq!(result.target, Some("_blank".into()));
696        assert_eq!(result.children.len(), 1);
697        assert!(matches!(&result.children[0], PhrasingContent::Text(t) if t == "Click me"));
698    }
699
700    #[test]
701    fn test_deserialize_predefined_div_with_class() {
702        use facet_html_dom::{Div, FlowContent};
703
704        let html = b"<div class=\"container\" id=\"main\">Hello World</div>";
705        let parser = HtmlParser::new(html);
706        let mut deserializer = FormatDeserializer::new(parser);
707        let result: Div = deserializer.deserialize().unwrap();
708        assert_eq!(result.attrs.class, Some("container".into()));
709        assert_eq!(result.attrs.id, Some("main".into()));
710        assert_eq!(result.children.len(), 1);
711        assert!(matches!(&result.children[0], FlowContent::Text(t) if t == "Hello World"));
712    }
713
714    #[test]
715    fn test_mixed_content_events() {
716        // Test: <p>Hello <strong>world</strong> there</p>
717        // Should produce events with text nodes in their correct positions
718        let html = b"<p>Hello <strong>world</strong> there</p>";
719        let events = build_events(html).unwrap();
720
721        // Should have:
722        // StructStart (p)
723        // FieldKey(_tag) -> "p"
724        // FieldKey(_text) -> "Hello"
725        // FieldKey(strong) -> StructStart, FieldKey(_tag), "strong", FieldKey(_text), "world", StructEnd
726        // FieldKey(_text) -> "there"
727        // StructEnd
728        assert_eq!(
729            events,
730            vec![
731                ParseEvent::StructStart(ContainerKind::Element),
732                ParseEvent::FieldKey(FieldKey::new(Cow::Borrowed("_tag"), FieldLocationHint::Tag)),
733                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("p".into()))),
734                ParseEvent::FieldKey(FieldKey::new(
735                    Cow::Borrowed("_text"),
736                    FieldLocationHint::Text
737                )),
738                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("Hello".into()))),
739                ParseEvent::FieldKey(FieldKey::new(
740                    Cow::Owned("strong".into()),
741                    FieldLocationHint::Child
742                )),
743                ParseEvent::StructStart(ContainerKind::Element),
744                ParseEvent::FieldKey(FieldKey::new(Cow::Borrowed("_tag"), FieldLocationHint::Tag)),
745                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("strong".into()))),
746                ParseEvent::FieldKey(FieldKey::new(
747                    Cow::Borrowed("_text"),
748                    FieldLocationHint::Text
749                )),
750                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("world".into()))),
751                ParseEvent::StructEnd,
752                ParseEvent::FieldKey(FieldKey::new(
753                    Cow::Borrowed("_text"),
754                    FieldLocationHint::Text
755                )),
756                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("there".into()))),
757                ParseEvent::StructEnd,
758            ]
759        );
760    }
761
762    #[test]
763    fn test_mixed_content_deserialization() {
764        use facet_html_dom::{P, PhrasingContent};
765
766        // Test: <p>Hello <strong>world</strong> there</p>
767        let html = b"<p>Hello <strong>world</strong> there</p>";
768        let parser = HtmlParser::new(html);
769        let mut deserializer = FormatDeserializer::new(parser);
770        let result: P = deserializer.deserialize().unwrap();
771
772        // The children should have the interleaved text and element nodes
773        assert_eq!(result.children.len(), 3);
774        assert!(matches!(&result.children[0], PhrasingContent::Text(t) if t == "Hello"));
775        // Strong now has children, not a text field
776        if let PhrasingContent::Strong(strong) = &result.children[1] {
777            assert_eq!(strong.children.len(), 1);
778            assert!(matches!(&strong.children[0], PhrasingContent::Text(t) if t == "world"));
779        } else {
780            panic!("Expected Strong element");
781        }
782        assert!(matches!(&result.children[2], PhrasingContent::Text(t) if t == "there"));
783    }
784
785    #[test]
786    fn test_mixed_content_multiple_elements() {
787        use facet_html_dom::{P, PhrasingContent};
788
789        // Test: <p>Start <strong>bold</strong> middle <em>italic</em> end</p>
790        let html = b"<p>Start <strong>bold</strong> middle <em>italic</em> end</p>";
791        let parser = HtmlParser::new(html);
792        let mut deserializer = FormatDeserializer::new(parser);
793        let result: P = deserializer.deserialize().unwrap();
794
795        assert_eq!(result.children.len(), 5);
796        assert!(matches!(&result.children[0], PhrasingContent::Text(t) if t == "Start"));
797        // Strong and Em now have children, not text fields
798        if let PhrasingContent::Strong(strong) = &result.children[1] {
799            assert_eq!(strong.children.len(), 1);
800            assert!(matches!(&strong.children[0], PhrasingContent::Text(t) if t == "bold"));
801        } else {
802            panic!("Expected Strong element");
803        }
804        assert!(matches!(&result.children[2], PhrasingContent::Text(t) if t == "middle"));
805        if let PhrasingContent::Em(em) = &result.children[3] {
806            assert_eq!(em.children.len(), 1);
807            assert!(matches!(&em.children[0], PhrasingContent::Text(t) if t == "italic"));
808        } else {
809            panic!("Expected Em element");
810        }
811        assert!(matches!(&result.children[4], PhrasingContent::Text(t) if t == "end"));
812    }
813
814    #[test]
815    fn test_deserialize_meta_charset() {
816        use facet_html_dom::Meta;
817
818        // Regression test for https://github.com/facet-rs/facet/issues/1527
819        // meta charset="utf-8" was failing with:
820        // "type mismatch: expected struct start, got Scalar(Str("utf-8"))"
821        let html = b"<meta charset=\"utf-8\">";
822        let parser = HtmlParser::new(html);
823        let mut deserializer = FormatDeserializer::new(parser);
824        let result: Meta = deserializer.deserialize().unwrap();
825        assert_eq!(result.charset, Some("utf-8".into()));
826    }
827
828    #[test]
829    fn test_deserialize_head_with_meta_charset() {
830        use facet_html_dom::Head;
831
832        // Regression test for https://github.com/facet-rs/facet/issues/1527
833        // The bug occurs when meta is inside head
834        let html = b"<head><meta charset=\"utf-8\"><title>Test</title></head>";
835        let parser = HtmlParser::new(html);
836        let mut deserializer = FormatDeserializer::new(parser);
837        let result: Head = deserializer.deserialize().unwrap();
838
839        // Head has dedicated fields for meta elements
840        assert!(!result.meta.is_empty(), "Should have a meta element");
841        assert_eq!(result.meta[0].charset, Some("utf-8".into()));
842    }
843
844    #[test]
845    fn test_deserialize_full_html_document_with_meta_charset() {
846        use facet_html_dom::Html;
847
848        // Full reproduction from https://github.com/facet-rs/facet/issues/1527
849        let html = br#"<!DOCTYPE html>
850<html>
851<head>
852    <meta charset="utf-8">
853    <title>Test Page</title>
854</head>
855<body>
856    <p>Hello</p>
857</body>
858</html>"#;
859
860        let parser = HtmlParser::new(html);
861        let mut deserializer = FormatDeserializer::new(parser);
862        let result: Html = deserializer.deserialize().unwrap();
863
864        // Verify head was parsed correctly
865        let head = result.head.as_ref().expect("Should have head");
866        assert!(!head.meta.is_empty(), "Should have meta elements");
867        assert_eq!(head.meta[0].charset, Some("utf-8".into()));
868
869        // Verify title
870        let title = head.title.as_ref().expect("Should have title");
871        assert_eq!(title.text, "Test Page");
872
873        // Verify body exists
874        assert!(result.body.is_some(), "Should have body");
875    }
876
877    #[test]
878    fn test_doctype_captured() {
879        use facet_html_dom::Html;
880
881        // Test that DOCTYPE is captured during parsing
882        let html = br#"<!DOCTYPE html>
883<html>
884<head><title>Test</title></head>
885<body></body>
886</html>"#;
887
888        let parser = HtmlParser::new(html);
889        let mut deserializer = FormatDeserializer::new(parser);
890        let result: Html = deserializer.deserialize().unwrap();
891
892        // Verify DOCTYPE was captured
893        assert_eq!(
894            result.doctype,
895            Some("html".to_string()),
896            "DOCTYPE should be captured"
897        );
898    }
899
900    #[test]
901    fn test_doctype_not_present() {
902        use facet_html_dom::Html;
903
904        // Test that DOCTYPE is None when not present
905        let html = br#"<html>
906<head><title>Test</title></head>
907<body></body>
908</html>"#;
909
910        let parser = HtmlParser::new(html);
911        let mut deserializer = FormatDeserializer::new(parser);
912        let result: Html = deserializer.deserialize().unwrap();
913
914        // Verify DOCTYPE is None
915        assert_eq!(
916            result.doctype, None,
917            "DOCTYPE should be None when not present"
918        );
919    }
920}