// facet_html/parser.rs

1extern crate alloc;
2
3use alloc::borrow::Cow;
4use alloc::string::String;
5use alloc::vec::Vec;
6use core::fmt;
7
8use facet_format::{
9    ContainerKind, FieldEvidence, FieldKey, FieldLocationHint, FormatParser, ParseEvent,
10    ProbeStream, ScalarValue,
11};
12use html5gum::{Token, Tokenizer};
13
14/// HTML parser implementing the `FormatParser` trait.
15///
16/// This parser builds a tree of HTML elements from the tokenizer output,
17/// then emits ParseEvents from the tree structure.
18pub struct HtmlParser<'de> {
19    events: Vec<ParseEvent<'de>>,
20    idx: usize,
21    pending_error: Option<HtmlError>,
22}
23
24impl<'de> HtmlParser<'de> {
25    /// Create a new HTML parser from input bytes.
26    pub fn new(input: &'de [u8]) -> Self {
27        match build_events(input) {
28            Ok(events) => Self {
29                events,
30                idx: 0,
31                pending_error: None,
32            },
33            Err(err) => Self {
34                events: Vec::new(),
35                idx: 0,
36                pending_error: Some(err),
37            },
38        }
39    }
40}
41
/// Errors that can be reported while parsing HTML.
#[derive(Debug, Clone)]
pub enum HtmlError {
    /// General parse error carrying a descriptive message.
    ParseError(String),
    /// The input (or event stream) ended before it was expected to.
    UnexpectedEof,
    /// The input bytes were not valid UTF-8.
    InvalidUtf8,
}

impl fmt::Display for HtmlError {
    /// Writes the human-readable message for each error variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::ParseError(msg) => write!(f, "HTML parse error: {}", msg),
            // The remaining variants carry no payload, so their messages are fixed strings.
            Self::UnexpectedEof => f.write_str("Unexpected end of HTML"),
            Self::InvalidUtf8 => f.write_str("Invalid UTF-8 in HTML"),
        }
    }
}

impl std::error::Error for HtmlError {}
64
65impl<'de> FormatParser<'de> for HtmlParser<'de> {
66    type Error = HtmlError;
67    type Probe<'a>
68        = HtmlProbe<'de>
69    where
70        Self: 'a;
71
72    fn next_event(&mut self) -> Result<Option<ParseEvent<'de>>, Self::Error> {
73        if let Some(err) = &self.pending_error {
74            return Err(err.clone());
75        }
76        if self.idx >= self.events.len() {
77            return Ok(None);
78        }
79        let event = self.events[self.idx].clone();
80        self.idx += 1;
81        Ok(Some(event))
82    }
83
84    fn peek_event(&mut self) -> Result<Option<ParseEvent<'de>>, Self::Error> {
85        if let Some(err) = &self.pending_error {
86            return Err(err.clone());
87        }
88        Ok(self.events.get(self.idx).cloned())
89    }
90
91    fn skip_value(&mut self) -> Result<(), Self::Error> {
92        let mut struct_depth = 0usize;
93        let mut pending_field_value = false;
94
95        loop {
96            let event = self.next_event()?.ok_or(HtmlError::UnexpectedEof)?;
97            match event {
98                ParseEvent::StructStart(_) | ParseEvent::SequenceStart(_) => {
99                    pending_field_value = false;
100                    struct_depth += 1;
101                }
102                ParseEvent::StructEnd | ParseEvent::SequenceEnd => {
103                    if struct_depth == 0 {
104                        break;
105                    } else {
106                        struct_depth -= 1;
107                        if struct_depth == 0 && !pending_field_value {
108                            break;
109                        }
110                    }
111                }
112                ParseEvent::Scalar(_) | ParseEvent::VariantTag(_) => {
113                    if struct_depth == 0 && !pending_field_value {
114                        break;
115                    }
116                    pending_field_value = false;
117                }
118                ParseEvent::FieldKey(_) | ParseEvent::OrderedField => {
119                    pending_field_value = true;
120                }
121            }
122        }
123        Ok(())
124    }
125
126    fn begin_probe(&mut self) -> Result<Self::Probe<'_>, Self::Error> {
127        let evidence = self.build_probe();
128        Ok(HtmlProbe { evidence, idx: 0 })
129    }
130}
131
132impl<'de> HtmlParser<'de> {
133    /// Build field evidence by looking ahead at remaining events.
134    fn build_probe(&self) -> Vec<FieldEvidence<'de>> {
135        let mut evidence = Vec::new();
136
137        if self.idx >= self.events.len() {
138            return evidence;
139        }
140
141        if !matches!(
142            self.events.get(self.idx),
143            Some(ParseEvent::StructStart(ContainerKind::Element))
144        ) {
145            return evidence;
146        }
147
148        let mut i = self.idx + 1;
149        let mut depth = 0usize;
150
151        while i < self.events.len() {
152            match &self.events[i] {
153                ParseEvent::StructStart(_) | ParseEvent::SequenceStart(_) => {
154                    depth += 1;
155                    i += 1;
156                }
157                ParseEvent::StructEnd | ParseEvent::SequenceEnd => {
158                    if depth == 0 {
159                        break;
160                    }
161                    depth -= 1;
162                    i += 1;
163                }
164                ParseEvent::FieldKey(key) if depth == 0 => {
165                    let scalar_value = if let Some(next_event) = self.events.get(i + 1) {
166                        match next_event {
167                            ParseEvent::Scalar(sv) => Some(sv.clone()),
168                            _ => None,
169                        }
170                    } else {
171                        None
172                    };
173
174                    if let Some(sv) = scalar_value {
175                        evidence.push(FieldEvidence::with_scalar_value(
176                            key.name.clone(),
177                            key.location,
178                            None,
179                            sv,
180                            key.namespace.clone(),
181                        ));
182                    } else {
183                        evidence.push(FieldEvidence::new(
184                            key.name.clone(),
185                            key.location,
186                            None,
187                            key.namespace.clone(),
188                        ));
189                    }
190                    i += 1;
191                }
192                _ => {
193                    i += 1;
194                }
195            }
196        }
197
198        evidence
199    }
200}
201
202/// Probe stream for HTML evidence collection.
203pub struct HtmlProbe<'de> {
204    evidence: Vec<FieldEvidence<'de>>,
205    idx: usize,
206}
207
208impl<'de> ProbeStream<'de> for HtmlProbe<'de> {
209    type Error = HtmlError;
210
211    fn next(&mut self) -> Result<Option<FieldEvidence<'de>>, Self::Error> {
212        if self.idx >= self.evidence.len() {
213            Ok(None)
214        } else {
215            let ev = self.evidence[self.idx].clone();
216            self.idx += 1;
217            Ok(Some(ev))
218        }
219    }
220}
221
/// A child node in the DOM tree - either text or an element.
#[derive(Debug, Clone)]
enum ChildNode {
    /// A run of text.
    Text(String),
    /// A nested element.
    Element(Element),
}

/// An HTML element in the DOM tree.
#[derive(Debug, Clone)]
struct Element {
    /// Tag name, stored exactly as passed to `Element::new`.
    name: String,
    /// Attributes as (name, value) pairs.
    attributes: Vec<(String, String)>,
    /// Child nodes (text and elements interleaved, preserving order).
    children: Vec<ChildNode>,
}

impl Element {
    /// Creates an element with the given name and attributes and no children.
    fn new(name: String, attributes: Vec<(String, String)>) -> Self {
        Self {
            name,
            attributes,
            children: Vec::new(),
        }
    }

    /// Appends text content to this element.
    ///
    /// Leading and trailing whitespace is trimmed; whitespace inside the text
    /// is kept as-is. Text that is empty after trimming is dropped. When the
    /// most recent child is already a text node, the new text is appended to
    /// it after a single space instead of creating another node.
    fn push_text(&mut self, text: &str) {
        let text = text.trim();
        if text.is_empty() {
            return;
        }

        match self.children.last_mut() {
            Some(ChildNode::Text(previous)) => {
                previous.push(' ');
                previous.push_str(text);
            }
            _ => self.children.push(ChildNode::Text(String::from(text))),
        }
    }

    /// Appends a nested element as the last child.
    fn push_child(&mut self, child: Element) {
        self.children.push(ChildNode::Element(child));
    }
}
270
/// HTML void elements that cannot have children.
///
/// Every entry is lowercase ASCII; `is_void_element` relies on that.
const VOID_ELEMENTS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source",
    "track", "wbr",
];

/// Returns `true` if `name` is one of the `VOID_ELEMENTS`, ignoring ASCII case.
///
/// The comparison is done in place, so no lowercase copy of `name` is allocated.
fn is_void_element(name: &str) -> bool {
    VOID_ELEMENTS
        .iter()
        .any(|void_name| void_name.eq_ignore_ascii_case(name))
}
280
281/// Build ParseEvents from HTML input.
282fn build_events<'de>(input: &'de [u8]) -> Result<Vec<ParseEvent<'de>>, HtmlError> {
283    let input_str = core::str::from_utf8(input).map_err(|_| HtmlError::InvalidUtf8)?;
284
285    let tokenizer = Tokenizer::new(input_str);
286    let mut stack: Vec<Element> = Vec::new();
287    let mut roots: Vec<Element> = Vec::new();
288
289    for token_result in tokenizer {
290        let token = token_result.map_err(|_| HtmlError::ParseError("tokenizer error".into()))?;
291
292        match token {
293            Token::StartTag(tag) => {
294                let name = String::from_utf8_lossy(&tag.name).to_ascii_lowercase();
295                let attributes: Vec<(String, String)> = tag
296                    .attributes
297                    .iter()
298                    .map(|(k, v)| {
299                        (
300                            String::from_utf8_lossy(k).into_owned(),
301                            String::from_utf8_lossy(v).into_owned(),
302                        )
303                    })
304                    .collect();
305
306                let elem = Element::new(name.clone(), attributes);
307
308                if tag.self_closing || is_void_element(&name) {
309                    // Self-closing or void element - attach immediately
310                    attach_element(&mut stack, elem, &mut roots);
311                } else {
312                    // Push onto stack to collect children
313                    stack.push(elem);
314                }
315            }
316            Token::EndTag(tag) => {
317                let end_name = String::from_utf8_lossy(&tag.name).to_ascii_lowercase();
318
319                // Pop elements until we find a matching start tag
320                // This handles malformed HTML gracefully
321                while let Some(elem) = stack.pop() {
322                    if elem.name == end_name {
323                        attach_element(&mut stack, elem, &mut roots);
324                        break;
325                    } else {
326                        // Implicitly close this element (HTML error recovery)
327                        attach_element(&mut stack, elem, &mut roots);
328                    }
329                }
330            }
331            Token::String(text) => {
332                let text_str = String::from_utf8_lossy(&text);
333                if let Some(current) = stack.last_mut() {
334                    current.push_text(&text_str);
335                }
336                // Text outside elements is ignored
337            }
338            Token::Doctype(_) | Token::Comment(_) | Token::Error(_) => {
339                // Ignore doctype, comments, and errors
340            }
341        }
342    }
343
344    // Close any remaining open elements
345    while let Some(elem) = stack.pop() {
346        attach_element(&mut stack, elem, &mut roots);
347    }
348
349    // Generate events from the tree
350    let mut events = Vec::new();
351
352    if roots.is_empty() {
353        // Empty document
354        events.push(ParseEvent::StructStart(ContainerKind::Element));
355        events.push(ParseEvent::StructEnd);
356    } else if roots.len() == 1 {
357        // Single root element
358        emit_element_events(&roots[0], &mut events);
359    } else {
360        // Multiple roots - wrap in a virtual document element
361        events.push(ParseEvent::StructStart(ContainerKind::Element));
362        for root in &roots {
363            let key = FieldKey::new(Cow::Owned(root.name.clone()), FieldLocationHint::Child);
364            events.push(ParseEvent::FieldKey(key));
365            emit_element_events(root, &mut events);
366        }
367        events.push(ParseEvent::StructEnd);
368    }
369
370    Ok(events)
371}
372
373/// Attach an element to its parent or to the roots list.
374fn attach_element(stack: &mut [Element], elem: Element, roots: &mut Vec<Element>) {
375    if let Some(parent) = stack.last_mut() {
376        parent.push_child(elem);
377    } else {
378        roots.push(elem);
379    }
380}
381
382/// Emit ParseEvents from an Element.
383fn emit_element_events<'de>(elem: &Element, events: &mut Vec<ParseEvent<'de>>) {
384    let has_attrs = !elem.attributes.is_empty();
385    let has_children = !elem.children.is_empty();
386
387    // Case 1: No attributes, no children - emit empty struct
388    if !has_attrs && !has_children {
389        events.push(ParseEvent::StructStart(ContainerKind::Element));
390        events.push(ParseEvent::StructEnd);
391        return;
392    }
393
394    // Case 2: Has attributes or children - emit as struct with _text children
395    // The deserializer handles grouping repeated field names into sequences.
396    events.push(ParseEvent::StructStart(ContainerKind::Element));
397
398    // Emit attributes as fields
399    for (name, value) in &elem.attributes {
400        let key = FieldKey::new(Cow::Owned(name.clone()), FieldLocationHint::Attribute);
401        events.push(ParseEvent::FieldKey(key));
402        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
403            value.clone(),
404        ))));
405    }
406
407    // Emit children in order (preserving interleaved text/element ordering)
408    for child in &elem.children {
409        match child {
410            ChildNode::Text(text) => {
411                let key = FieldKey::new(Cow::Borrowed("_text"), FieldLocationHint::Text);
412                events.push(ParseEvent::FieldKey(key));
413                events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
414                    text.clone(),
415                ))));
416            }
417            ChildNode::Element(child_elem) => {
418                let key = FieldKey::new(
419                    Cow::Owned(child_elem.name.clone()),
420                    FieldLocationHint::Child,
421                );
422                events.push(ParseEvent::FieldKey(key));
423                emit_element_events(child_elem, events);
424            }
425        }
426    }
427
428    events.push(ParseEvent::StructEnd);
429}
430
/// Unit tests covering both the raw event stream produced by `build_events`
/// and end-to-end deserialization through `FormatDeserializer`.
#[cfg(test)]
mod tests {
    use super::*;
    use facet::Facet;
    use facet_format::FormatDeserializer;

    /// An element containing only text becomes a struct with one `_text` field.
    #[test]
    fn test_simple_element() {
        let html = b"<div>hello</div>";
        let events = build_events(html).unwrap();
        // Elements with text now emit struct with _text child
        assert_eq!(
            events,
            vec![
                ParseEvent::StructStart(ContainerKind::Element),
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Borrowed("_text"),
                    FieldLocationHint::Text
                )),
                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("hello".into()))),
                ParseEvent::StructEnd,
            ]
        );
    }

    /// Attributes are emitted as fields before the text content.
    #[test]
    fn test_element_with_attribute() {
        let html = b"<div class=\"foo\">hello</div>";
        let events = build_events(html).unwrap();
        assert_eq!(
            events,
            vec![
                ParseEvent::StructStart(ContainerKind::Element),
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Owned("class".into()),
                    FieldLocationHint::Attribute
                )),
                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("foo".into()))),
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Owned("_text".into()),
                    FieldLocationHint::Text
                )),
                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("hello".into()))),
                ParseEvent::StructEnd,
            ]
        );
    }

    /// A child element is emitted as a field named after its tag.
    #[test]
    fn test_nested_elements() {
        let html = b"<div><span>inner</span></div>";
        let events = build_events(html).unwrap();
        // Nested elements now emit struct with _text for inner content
        assert_eq!(
            events,
            vec![
                ParseEvent::StructStart(ContainerKind::Element),
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Owned("span".into()),
                    FieldLocationHint::Child
                )),
                ParseEvent::StructStart(ContainerKind::Element),
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Borrowed("_text"),
                    FieldLocationHint::Text
                )),
                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("inner".into()))),
                ParseEvent::StructEnd,
                ParseEvent::StructEnd,
            ]
        );
    }

    /// A void element without an end tag must not swallow its following siblings.
    #[test]
    fn test_void_element() {
        let html = b"<div><br><span>after</span></div>";
        let events = build_events(html).unwrap();
        // br is a void element: it is emitted as an empty struct and the
        // following span stays a sibling inside the div, not a child of br.
        assert_eq!(
            events,
            vec![
                ParseEvent::StructStart(ContainerKind::Element),
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Owned("br".into()),
                    FieldLocationHint::Child
                )),
                ParseEvent::StructStart(ContainerKind::Element),
                ParseEvent::StructEnd,
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Owned("span".into()),
                    FieldLocationHint::Child
                )),
                ParseEvent::StructStart(ContainerKind::Element),
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Borrowed("_text"),
                    FieldLocationHint::Text
                )),
                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("after".into()))),
                ParseEvent::StructEnd,
                ParseEvent::StructEnd,
            ]
        );
    }

    /// A single attribute deserializes into an optional struct field.
    #[test]
    fn test_deserialize_simple_struct() {
        #[derive(Debug, Facet, PartialEq)]
        struct Div {
            #[facet(default)]
            class: Option<String>,
        }

        let html = b"<div class=\"container\"></div>";
        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: Div = deserializer.deserialize().unwrap();
        assert_eq!(result.class, Some("container".into()));
    }

    /// Nested elements deserialize into nested structs, with text captured
    /// by the `xml::text` field.
    #[test]
    fn test_deserialize_nested() {
        use facet_xml as xml;

        #[derive(Debug, Facet, PartialEq)]
        struct Outer {
            #[facet(default)]
            inner: Option<Inner>,
        }

        #[derive(Debug, Facet, PartialEq)]
        struct Inner {
            #[facet(default)]
            value: Option<Value>,
        }

        #[derive(Debug, Facet, PartialEq)]
        struct Value {
            #[facet(xml::text, default)]
            text: String,
        }

        let html = b"<outer><inner><value>hello</value></inner></outer>";
        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: Outer = deserializer.deserialize().unwrap();
        assert_eq!(
            result,
            Outer {
                inner: Some(Inner {
                    value: Some(Value {
                        text: "hello".into()
                    })
                })
            }
        );
    }

    /// Sibling child elements each deserialize into their own text-bearing struct.
    #[test]
    fn test_deserialize_with_text() {
        use facet_xml as xml;

        #[derive(Debug, Facet, PartialEq)]
        struct Article {
            #[facet(default)]
            title: Option<TitleElement>,
            #[facet(default)]
            content: Option<ContentElement>,
        }

        #[derive(Debug, Facet, PartialEq)]
        struct TitleElement {
            #[facet(xml::text, default)]
            text: String,
        }

        #[derive(Debug, Facet, PartialEq)]
        struct ContentElement {
            #[facet(xml::text, default)]
            text: String,
        }

        let html = b"<article><title>Hello</title><content>World</content></article>";
        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: Article = deserializer.deserialize().unwrap();
        assert_eq!(
            result,
            Article {
                title: Some(TitleElement {
                    text: "Hello".into()
                }),
                content: Some(ContentElement {
                    text: "World".into()
                })
            }
        );
    }

    /// Several attributes on one element each land in their matching field.
    #[test]
    fn test_deserialize_multiple_attributes() {
        #[derive(Debug, Facet, PartialEq)]
        struct Link {
            #[facet(default)]
            href: Option<String>,
            #[facet(default)]
            target: Option<String>,
            #[facet(default)]
            rel: Option<String>,
        }

        let html = b"<a href=\"https://example.com\" target=\"_blank\" rel=\"noopener\"></a>";
        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: Link = deserializer.deserialize().unwrap();
        assert_eq!(
            result,
            Link {
                href: Some("https://example.com".into()),
                target: Some("_blank".into()),
                rel: Some("noopener".into())
            }
        );
    }

    /// The predefined `Img` type picks up its attributes from a void `<img>` tag.
    #[test]
    fn test_deserialize_predefined_img() {
        use crate::elements::Img;

        let html = b"<img src=\"photo.jpg\" alt=\"A photo\" width=\"100\" height=\"200\">";
        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: Img = deserializer.deserialize().unwrap();
        assert_eq!(result.src, Some("photo.jpg".into()));
        assert_eq!(result.alt, Some("A photo".into()));
        assert_eq!(result.width, Some("100".into()));
        assert_eq!(result.height, Some("200".into()));
    }

    /// The predefined `A` type gets both its attributes and its text child.
    #[test]
    fn test_deserialize_predefined_a() {
        use crate::elements::{A, PhrasingContent};

        let html = b"<a href=\"https://example.com\" target=\"_blank\">Click me</a>";
        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: A = deserializer.deserialize().unwrap();
        assert_eq!(result.href, Some("https://example.com".into()));
        assert_eq!(result.target, Some("_blank".into()));
        assert_eq!(result.children.len(), 1);
        assert!(matches!(&result.children[0], PhrasingContent::Text(t) if t == "Click me"));
    }

    /// The predefined `Div` type exposes `class`/`id` via `attrs` and text as a child.
    #[test]
    fn test_deserialize_predefined_div_with_class() {
        use crate::elements::{Div, FlowContent};

        let html = b"<div class=\"container\" id=\"main\">Hello World</div>";
        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: Div = deserializer.deserialize().unwrap();
        assert_eq!(result.attrs.class, Some("container".into()));
        assert_eq!(result.attrs.id, Some("main".into()));
        assert_eq!(result.children.len(), 1);
        assert!(matches!(&result.children[0], FlowContent::Text(t) if t == "Hello World"));
    }

    /// Text and element children keep their document order in the event stream.
    #[test]
    fn test_mixed_content_events() {
        // Test: <p>Hello <strong>world</strong> there</p>
        // Should produce events with text nodes in their correct positions
        let html = b"<p>Hello <strong>world</strong> there</p>";
        let events = build_events(html).unwrap();

        // Should have:
        // StructStart (p)
        // FieldKey(_text) -> "Hello"
        // FieldKey(strong) -> StructStart, FieldKey(_text), "world", StructEnd
        // FieldKey(_text) -> "there"
        // StructEnd
        assert_eq!(
            events,
            vec![
                ParseEvent::StructStart(ContainerKind::Element),
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Borrowed("_text"),
                    FieldLocationHint::Text
                )),
                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("Hello".into()))),
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Owned("strong".into()),
                    FieldLocationHint::Child
                )),
                ParseEvent::StructStart(ContainerKind::Element),
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Borrowed("_text"),
                    FieldLocationHint::Text
                )),
                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("world".into()))),
                ParseEvent::StructEnd,
                ParseEvent::FieldKey(FieldKey::new(
                    Cow::Borrowed("_text"),
                    FieldLocationHint::Text
                )),
                ParseEvent::Scalar(ScalarValue::Str(Cow::Owned("there".into()))),
                ParseEvent::StructEnd,
            ]
        );
    }

    /// Mixed text/element content deserializes into interleaved children.
    #[test]
    fn test_mixed_content_deserialization() {
        use crate::elements::{P, PhrasingContent};

        // Test: <p>Hello <strong>world</strong> there</p>
        let html = b"<p>Hello <strong>world</strong> there</p>";
        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: P = deserializer.deserialize().unwrap();

        // The children should have the interleaved text and element nodes
        assert_eq!(result.children.len(), 3);
        assert!(matches!(&result.children[0], PhrasingContent::Text(t) if t == "Hello"));
        // Strong now has children, not a text field
        if let PhrasingContent::Strong(strong) = &result.children[1] {
            assert_eq!(strong.children.len(), 1);
            assert!(matches!(&strong.children[0], PhrasingContent::Text(t) if t == "world"));
        } else {
            panic!("Expected Strong element");
        }
        assert!(matches!(&result.children[2], PhrasingContent::Text(t) if t == "there"));
    }

    /// Several inline elements separated by text all keep their positions.
    #[test]
    fn test_mixed_content_multiple_elements() {
        use crate::elements::{P, PhrasingContent};

        // Test: <p>Start <strong>bold</strong> middle <em>italic</em> end</p>
        let html = b"<p>Start <strong>bold</strong> middle <em>italic</em> end</p>";
        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: P = deserializer.deserialize().unwrap();

        assert_eq!(result.children.len(), 5);
        assert!(matches!(&result.children[0], PhrasingContent::Text(t) if t == "Start"));
        // Strong and Em now have children, not text fields
        if let PhrasingContent::Strong(strong) = &result.children[1] {
            assert_eq!(strong.children.len(), 1);
            assert!(matches!(&strong.children[0], PhrasingContent::Text(t) if t == "bold"));
        } else {
            panic!("Expected Strong element");
        }
        assert!(matches!(&result.children[2], PhrasingContent::Text(t) if t == "middle"));
        if let PhrasingContent::Em(em) = &result.children[3] {
            assert_eq!(em.children.len(), 1);
            assert!(matches!(&em.children[0], PhrasingContent::Text(t) if t == "italic"));
        } else {
            panic!("Expected Em element");
        }
        assert!(matches!(&result.children[4], PhrasingContent::Text(t) if t == "end"));
    }

    /// A standalone `<meta charset>` deserializes into `Meta`.
    #[test]
    fn test_deserialize_meta_charset() {
        use crate::elements::Meta;

        // Regression test for https://github.com/facet-rs/facet/issues/1527
        // meta charset="utf-8" was failing with:
        // "type mismatch: expected struct start, got Scalar(Str("utf-8"))"
        let html = b"<meta charset=\"utf-8\">";
        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: Meta = deserializer.deserialize().unwrap();
        assert_eq!(result.charset, Some("utf-8".into()));
    }

    /// A `<meta charset>` nested inside `<head>` deserializes into `Head::meta`.
    #[test]
    fn test_deserialize_head_with_meta_charset() {
        use crate::elements::Head;

        // Regression test for https://github.com/facet-rs/facet/issues/1527
        // The bug occurs when meta is inside head
        let html = b"<head><meta charset=\"utf-8\"><title>Test</title></head>";
        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: Head = deserializer.deserialize().unwrap();

        // Head has dedicated fields for meta elements
        assert!(!result.meta.is_empty(), "Should have a meta element");
        assert_eq!(result.meta[0].charset, Some("utf-8".into()));
    }

    /// A complete document with doctype, head and body deserializes into `Html`.
    #[test]
    fn test_deserialize_full_html_document_with_meta_charset() {
        use crate::elements::Html;

        // Full reproduction from https://github.com/facet-rs/facet/issues/1527
        let html = br#"<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Test Page</title>
</head>
<body>
    <p>Hello</p>
</body>
</html>"#;

        let parser = HtmlParser::new(html);
        let mut deserializer = FormatDeserializer::new(parser);
        let result: Html = deserializer.deserialize().unwrap();

        // Verify head was parsed correctly
        let head = result.head.as_ref().expect("Should have head");
        assert!(!head.meta.is_empty(), "Should have meta elements");
        assert_eq!(head.meta[0].charset, Some("utf-8".into()));

        // Verify title
        let title = head.title.as_ref().expect("Should have title");
        assert_eq!(title.text, "Test Page");

        // Verify body exists
        assert!(result.body.is_some(), "Should have body");
    }
}