html-cat 0.1.0

HTML5 parser: a tokenizer plus a tree builder that produces a Document tree of Element, Text, and Comment nodes. Design constraints: no `mut`, no `Rc`/`Arc`, no interior mutability, no panics, and exhaustive matches only. It is the first sub-crate of a Servo-replacement webview runtime targeting Tauri.
//! Document, Element, Text, Comment nodes.

use crate::attr::Attributes;
use crate::span::Span;

/// The top-level value of a parse: an optional doctype, the root element,
/// and the span of the whole input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Document {
    doctype: Option<Doctype>,
    root: Element,
    span: Span,
}

impl Document {
    /// Assemble a document from its already-built parts.
    ///
    /// The arguments are stored as given; no validation is performed.
    #[must_use]
    pub fn new(doctype: Option<Doctype>, root: Element, span: Span) -> Self {
        Document {
            doctype,
            root,
            span,
        }
    }

    /// Borrow the `<!DOCTYPE ...>` declaration, or `None` if the document
    /// was built without one.
    #[must_use]
    pub fn doctype(&self) -> Option<&Doctype> {
        let Self { doctype, .. } = self;
        doctype.as_ref()
    }

    /// Borrow the root `<html>` element.
    #[must_use]
    pub fn root(&self) -> &Element {
        let Self { root, .. } = self;
        root
    }

    /// The span covering the entire document, returned by value.
    #[must_use]
    pub fn span(&self) -> Span {
        let Self { span, .. } = self;
        *span
    }
}

/// A parsed `<!DOCTYPE ...>` declaration: its name, the optional legacy
/// identifiers, and where it sits in the source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Doctype {
    name: String,
    public_id: Option<String>,
    system_id: Option<String>,
    span: Span,
}

impl Doctype {
    /// Assemble a doctype from its parts.
    ///
    /// `name` is converted into an owned `String`; the identifiers and the
    /// span are stored as given.
    #[must_use]
    pub fn new(
        name: impl Into<String>,
        public_id: Option<String>,
        system_id: Option<String>,
        span: Span,
    ) -> Self {
        let name = name.into();
        Doctype {
            name,
            public_id,
            system_id,
            span,
        }
    }

    /// The root element name (typically `"html"`).
    #[must_use]
    pub fn name(&self) -> &str {
        self.name.as_str()
    }

    /// The PUBLIC identifier (legacy DTDs), or `None` if absent.
    #[must_use]
    pub fn public_id(&self) -> Option<&str> {
        let Self { public_id, .. } = self;
        public_id.as_deref()
    }

    /// The SYSTEM identifier (legacy DTDs), or `None` if absent.
    #[must_use]
    pub fn system_id(&self) -> Option<&str> {
        let Self { system_id, .. } = self;
        system_id.as_deref()
    }

    /// The span of the doctype declaration, returned by value.
    #[must_use]
    pub fn span(&self) -> Span {
        let Self { span, .. } = self;
        *span
    }
}

/// One child of an [`Element`]: a nested element, a text run, or a comment.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Node {
    /// A nested element.
    Element(Element),
    /// A text run.
    Text(Text),
    /// A `<!-- comment -->`.
    Comment(Comment),
}

impl Node {
    /// The span of the node, obtained from whichever value the variant wraps.
    #[must_use]
    pub fn span(&self) -> Span {
        // Every variant is listed explicitly so that adding a new node kind
        // fails to compile here instead of being silently ignored.
        match *self {
            Node::Element(ref element) => element.span(),
            Node::Text(ref text) => text.span(),
            Node::Comment(ref comment) => comment.span(),
        }
    }
}

/// A single HTML element: tag name, attributes, child nodes, the
/// self-closing flag, and its source span.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Element {
    name: String,
    attributes: Attributes,
    children: Vec<Node>,
    self_closing: bool,
    span: Span,
}

impl Element {
    /// Assemble an element from its parts.
    ///
    /// `name` is converted into an owned `String`; every other argument is
    /// stored as given, without validation or normalisation.
    #[must_use]
    pub fn new(
        name: impl Into<String>,
        attributes: Attributes,
        children: Vec<Node>,
        self_closing: bool,
        span: Span,
    ) -> Self {
        let name = name.into();
        Element {
            name,
            attributes,
            children,
            self_closing,
            span,
        }
    }

    /// The tag name, lower-cased per HTML5.
    #[must_use]
    pub fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Borrow the attributes, in source order.
    #[must_use]
    pub fn attributes(&self) -> &Attributes {
        let Self { attributes, .. } = self;
        attributes
    }

    /// Borrow the child nodes as a slice, in source order.
    #[must_use]
    pub fn children(&self) -> &[Node] {
        self.children.as_slice()
    }

    /// Whether the element was written self-closing (`<br />`) or is a
    /// void element.
    ///
    /// This simply reports the flag supplied to [`Element::new`].
    #[must_use]
    pub fn is_self_closing(&self) -> bool {
        let Self { self_closing, .. } = self;
        *self_closing
    }

    /// The span of the element, from the opening tag through the closing
    /// tag (inclusive), returned by value.
    #[must_use]
    pub fn span(&self) -> Span {
        let Self { span, .. } = self;
        *span
    }
}

/// A run of character data together with its source span.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Text {
    content: String,
    span: Span,
}

impl Text {
    /// Assemble a text node.
    ///
    /// `content` is converted into an owned `String` and stored unchanged.
    #[must_use]
    pub fn new(content: impl Into<String>, span: Span) -> Self {
        let content = content.into();
        Text { content, span }
    }

    /// The decoded text content (entities resolved).
    #[must_use]
    pub fn content(&self) -> &str {
        self.content.as_str()
    }

    /// The span of the text run, returned by value.
    #[must_use]
    pub fn span(&self) -> Span {
        let Self { span, .. } = self;
        *span
    }
}

/// A `<!-- comment -->` node: the comment text and its source span.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Comment {
    text: String,
    span: Span,
}

impl Comment {
    /// Assemble a comment node.
    ///
    /// `text` is converted into an owned `String` and stored unchanged.
    #[must_use]
    pub fn new(text: impl Into<String>, span: Span) -> Self {
        let text = text.into();
        Comment { text, span }
    }

    /// The comment text (between `<!--` and `-->`).
    #[must_use]
    pub fn text(&self) -> &str {
        self.text.as_str()
    }

    /// The span of the comment, returned by value.
    #[must_use]
    pub fn span(&self) -> Span {
        let Self { span, .. } = self;
        *span
    }
}

/// Report whether `name` is an HTML5 void element, i.e. one that takes no
/// children and is closed automatically.
///
/// The comparison is an exact, case-sensitive string match, so `"BR"` is
/// not recognised.
///
/// NOTE(review): `param` and `keygen` are not in this list; confirm that
/// the tree builder does not rely on this function for those tags.
#[must_use]
pub fn is_void_element(name: &str) -> bool {
    // Kept in alphabetical order to make additions easy to review.
    const VOID_ELEMENTS: [&str; 13] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "source",
        "track", "wbr",
    ];

    VOID_ELEMENTS.iter().any(|&tag| tag == name)
}

/// Report whether `name` is a raw-text element: one whose contents are not
/// parsed as HTML, the run ending only at the matching close tag.
///
/// The comparison is exact and case-sensitive; only `"script"` and
/// `"style"` qualify.
#[must_use]
pub fn is_raw_text_element(name: &str) -> bool {
    name == "script" || name == "style"
}

/// Report whether `name` is an escapable raw-text element: entities inside
/// it are decoded, but nested tags are not recognised.
///
/// The comparison is exact and case-sensitive; only `"textarea"` and
/// `"title"` qualify.
#[must_use]
pub fn is_escapable_raw_text_element(name: &str) -> bool {
    ["textarea", "title"].contains(&name)
}