typst_library/html/
dom.rs

1use std::fmt::{self, Debug, Display, Formatter};
2
3use ecow::{EcoString, EcoVec};
4use typst_syntax::Span;
5use typst_utils::{PicoStr, ResolvedPicoStr};
6
7use crate::diag::{bail, HintedStrResult, StrResult};
8use crate::foundations::{cast, Dict, Repr, Str};
9use crate::introspection::{Introspector, Tag};
10use crate::layout::Frame;
11use crate::model::DocumentInfo;
12
13/// An HTML document.
14#[derive(Debug, Clone)]
15pub struct HtmlDocument {
16    /// The document's root HTML element.
17    pub root: HtmlElement,
18    /// Details about the document.
19    pub info: DocumentInfo,
20    /// Provides the ability to execute queries on the document.
21    pub introspector: Introspector,
22}
23
24/// A child of an HTML element.
25#[derive(Debug, Clone, Hash)]
26pub enum HtmlNode {
27    /// An introspectable element that produced something within this node.
28    Tag(Tag),
29    /// Plain text.
30    Text(EcoString, Span),
31    /// Another element.
32    Element(HtmlElement),
33    /// A frame that will be displayed as an embedded SVG.
34    Frame(Frame),
35}
36
37impl HtmlNode {
38    /// Create a plain text node.
39    pub fn text(text: impl Into<EcoString>, span: Span) -> Self {
40        Self::Text(text.into(), span)
41    }
42}
43
44impl From<HtmlElement> for HtmlNode {
45    fn from(element: HtmlElement) -> Self {
46        Self::Element(element)
47    }
48}
49
50/// An HTML element.
51#[derive(Debug, Clone, Hash)]
52pub struct HtmlElement {
53    /// The HTML tag.
54    pub tag: HtmlTag,
55    /// The element's attributes.
56    pub attrs: HtmlAttrs,
57    /// The element's children.
58    pub children: Vec<HtmlNode>,
59    /// The span from which the element originated, if any.
60    pub span: Span,
61}
62
63impl HtmlElement {
64    /// Create a new, blank element without attributes or children.
65    pub fn new(tag: HtmlTag) -> Self {
66        Self {
67            tag,
68            attrs: HtmlAttrs::default(),
69            children: vec![],
70            span: Span::detached(),
71        }
72    }
73
74    /// Attach children to the element.
75    ///
76    /// Note: This overwrites potential previous children.
77    pub fn with_children(mut self, children: Vec<HtmlNode>) -> Self {
78        self.children = children;
79        self
80    }
81
82    /// Add an atribute to the element.
83    pub fn with_attr(mut self, key: HtmlAttr, value: impl Into<EcoString>) -> Self {
84        self.attrs.push(key, value);
85        self
86    }
87
88    /// Attach a span to the element.
89    pub fn spanned(mut self, span: Span) -> Self {
90        self.span = span;
91        self
92    }
93}
94
95/// The tag of an HTML element.
96#[derive(Copy, Clone, Eq, PartialEq, Hash)]
97pub struct HtmlTag(PicoStr);
98
99impl HtmlTag {
100    /// Intern an HTML tag string at runtime.
101    pub fn intern(string: &str) -> StrResult<Self> {
102        if string.is_empty() {
103            bail!("tag name must not be empty");
104        }
105
106        if let Some(c) = string.chars().find(|&c| !charsets::is_valid_in_tag_name(c)) {
107            bail!("the character {} is not valid in a tag name", c.repr());
108        }
109
110        Ok(Self(PicoStr::intern(string)))
111    }
112
113    /// Creates a compile-time constant `HtmlTag`.
114    ///
115    /// Should only be used in const contexts because it can panic.
116    #[track_caller]
117    pub const fn constant(string: &'static str) -> Self {
118        if string.is_empty() {
119            panic!("tag name must not be empty");
120        }
121
122        let bytes = string.as_bytes();
123        let mut i = 0;
124        while i < bytes.len() {
125            if !bytes[i].is_ascii() || !charsets::is_valid_in_tag_name(bytes[i] as char) {
126                panic!("not all characters are valid in a tag name");
127            }
128            i += 1;
129        }
130
131        Self(PicoStr::constant(string))
132    }
133
134    /// Resolves the tag to a string.
135    pub fn resolve(self) -> ResolvedPicoStr {
136        self.0.resolve()
137    }
138
139    /// Turns the tag into its inner interned string.
140    pub const fn into_inner(self) -> PicoStr {
141        self.0
142    }
143}
144
145impl Debug for HtmlTag {
146    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
147        Display::fmt(self, f)
148    }
149}
150
151impl Display for HtmlTag {
152    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
153        write!(f, "<{}>", self.resolve())
154    }
155}
156
157cast! {
158    HtmlTag,
159    self => self.0.resolve().as_str().into_value(),
160    v: Str => Self::intern(&v)?,
161}
162
163/// Attributes of an HTML element.
164#[derive(Debug, Default, Clone, Eq, PartialEq, Hash)]
165pub struct HtmlAttrs(pub EcoVec<(HtmlAttr, EcoString)>);
166
167impl HtmlAttrs {
168    /// Add an attribute.
169    pub fn push(&mut self, attr: HtmlAttr, value: impl Into<EcoString>) {
170        self.0.push((attr, value.into()));
171    }
172}
173
174cast! {
175    HtmlAttrs,
176    self => self.0
177        .into_iter()
178        .map(|(key, value)| (key.resolve().as_str().into(), value.into_value()))
179        .collect::<Dict>()
180        .into_value(),
181    values: Dict => Self(values
182        .into_iter()
183        .map(|(k, v)| {
184            let attr = HtmlAttr::intern(&k)?;
185            let value = v.cast::<EcoString>()?;
186            Ok((attr, value))
187        })
188        .collect::<HintedStrResult<_>>()?),
189}
190
191/// An attribute of an HTML.
192#[derive(Copy, Clone, Eq, PartialEq, Hash)]
193pub struct HtmlAttr(PicoStr);
194
195impl HtmlAttr {
196    /// Intern an HTML attribute string at runtime.
197    pub fn intern(string: &str) -> StrResult<Self> {
198        if string.is_empty() {
199            bail!("attribute name must not be empty");
200        }
201
202        if let Some(c) =
203            string.chars().find(|&c| !charsets::is_valid_in_attribute_name(c))
204        {
205            bail!("the character {} is not valid in an attribute name", c.repr());
206        }
207
208        Ok(Self(PicoStr::intern(string)))
209    }
210
211    /// Creates a compile-time constant `HtmlAttr`.
212    ///
213    /// Must only be used in const contexts (in a constant definition or
214    /// explicit `const { .. }` block) because otherwise a panic for a malformed
215    /// attribute or not auto-internible constant will only be caught at
216    /// runtime.
217    #[track_caller]
218    pub const fn constant(string: &'static str) -> Self {
219        if string.is_empty() {
220            panic!("attribute name must not be empty");
221        }
222
223        let bytes = string.as_bytes();
224        let mut i = 0;
225        while i < bytes.len() {
226            if !bytes[i].is_ascii()
227                || !charsets::is_valid_in_attribute_name(bytes[i] as char)
228            {
229                panic!("not all characters are valid in an attribute name");
230            }
231            i += 1;
232        }
233
234        Self(PicoStr::constant(string))
235    }
236
237    /// Resolves the attribute to a string.
238    pub fn resolve(self) -> ResolvedPicoStr {
239        self.0.resolve()
240    }
241
242    /// Turns the attribute into its inner interned string.
243    pub const fn into_inner(self) -> PicoStr {
244        self.0
245    }
246}
247
248impl Debug for HtmlAttr {
249    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
250        Display::fmt(self, f)
251    }
252}
253
254impl Display for HtmlAttr {
255    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
256        write!(f, "{}", self.resolve())
257    }
258}
259
260cast! {
261    HtmlAttr,
262    self => self.0.resolve().as_str().into_value(),
263    v: Str => Self::intern(&v)?,
264}
265
/// Character classes describing the syntax of HTML tags, attributes, and
/// text.
pub mod charsets {
    /// Whether a character may appear in a tag name.
    ///
    /// Only ASCII letters and digits are accepted.
    pub const fn is_valid_in_tag_name(c: char) -> bool {
        c.is_ascii_alphanumeric()
    }

    /// Whether a character may appear in an attribute name.
    ///
    /// A handful of ASCII characters, all control characters, and all
    /// non-characters are forbidden. _Everything_ else is allowed, including
    /// U+2029 paragraph separator.
    pub const fn is_valid_in_attribute_name(c: char) -> bool {
        let explicitly_forbidden = matches!(c, '\0' | ' ' | '"' | '\'' | '>' | '/' | '=');
        !(explicitly_forbidden || is_whatwg_control_char(c) || is_whatwg_non_char(c))
    }

    /// Whether a character can be used in an attribute value without
    /// escaping.
    ///
    /// Ampersands are rejected even though they are sometimes legal (i.e.
    /// when they are not _ambiguous ampersands_); checking for that is not
    /// worth the trouble. Quotation marks are rejected because they are not
    /// allowed in double-quote-delimited attribute values.
    ///
    /// See <https://html.spec.whatwg.org/multipage/syntax.html#attributes-2>
    pub const fn is_valid_in_attribute_value(c: char) -> bool {
        !matches!(c, '&' | '"') && is_w3c_text_char(c)
    }

    /// Whether a character can be used in normal text without escaping.
    ///
    /// Ampersands are rejected even though they are sometimes legal (i.e.
    /// when they are not _ambiguous ampersands_); checking for that is not
    /// worth the trouble. Less-than signs are not allowed in text.
    pub const fn is_valid_in_normal_element_text(c: char) -> bool {
        !matches!(c, '&' | '<') && is_w3c_text_char(c)
    }

    /// Whether a character is valid text in HTML.
    ///
    /// Non-characters are never text. Control characters are only text if
    /// they are ASCII whitespace. Everything else is text.
    pub const fn is_w3c_text_char(c: char) -> bool {
        if is_whatwg_non_char(c) {
            return false;
        }
        if is_whatwg_control_char(c) {
            return c.is_ascii_whitespace();
        }
        true
    }

    /// Whether a character is a non-character: U+FDD0 through U+FDEF, or any
    /// code point of the form xxFFFE or xxFFFF.
    const fn is_whatwg_non_char(c: char) -> bool {
        let code = c as u32;
        // A `char` never exceeds U+10FFFF, so no explicit upper bound is
        // needed for the xxFFFE / xxFFFF test.
        matches!(code, 0xfdd0..=0xfdef) || (code & 0xfffe) == 0xfffe
    }

    /// Whether a character is a control character: a C0 control (U+0000
    /// through U+001F) or a code point from U+007F through U+009F.
    const fn is_whatwg_control_char(c: char) -> bool {
        matches!(c, '\u{00}'..='\u{1f}' | '\u{7f}'..='\u{9f}')
    }
}
348
349/// Predefined constants for HTML tags.
350pub mod tag {
351    use super::HtmlTag;
352
353    macro_rules! tags {
354        ($($tag:ident)*) => {
355            $(#[allow(non_upper_case_globals)]
356            pub const $tag: HtmlTag = HtmlTag::constant(
357                stringify!($tag)
358            );)*
359        }
360    }
361
362    tags! {
363        a
364        abbr
365        address
366        area
367        article
368        aside
369        audio
370        b
371        base
372        bdi
373        bdo
374        blockquote
375        body
376        br
377        button
378        canvas
379        caption
380        cite
381        code
382        col
383        colgroup
384        data
385        datalist
386        dd
387        del
388        details
389        dfn
390        dialog
391        div
392        dl
393        dt
394        em
395        embed
396        fieldset
397        figcaption
398        figure
399        footer
400        form
401        h1
402        h2
403        h3
404        h4
405        h5
406        h6
407        head
408        header
409        hgroup
410        hr
411        html
412        i
413        iframe
414        img
415        input
416        ins
417        kbd
418        label
419        legend
420        li
421        link
422        main
423        map
424        mark
425        menu
426        meta
427        meter
428        nav
429        noscript
430        object
431        ol
432        optgroup
433        option
434        output
435        p
436        param
437        picture
438        pre
439        progress
440        q
441        rp
442        rt
443        ruby
444        s
445        samp
446        script
447        search
448        section
449        select
450        slot
451        small
452        source
453        span
454        strong
455        style
456        sub
457        summary
458        sup
459        table
460        tbody
461        td
462        template
463        textarea
464        tfoot
465        th
466        thead
467        time
468        title
469        tr
470        track
471        u
472        ul
473        var
474        video
475        wbr
476    }
477
478    /// Whether this is a void tag whose associated element may not have a
479    /// children.
480    pub fn is_void(tag: HtmlTag) -> bool {
481        matches!(
482            tag,
483            self::area
484                | self::base
485                | self::br
486                | self::col
487                | self::embed
488                | self::hr
489                | self::img
490                | self::input
491                | self::link
492                | self::meta
493                | self::param
494                | self::source
495                | self::track
496                | self::wbr
497        )
498    }
499
500    /// Whether this is a tag containing raw text.
501    pub fn is_raw(tag: HtmlTag) -> bool {
502        matches!(tag, self::script | self::style)
503    }
504
505    /// Whether this is a tag containing escapable raw text.
506    pub fn is_escapable_raw(tag: HtmlTag) -> bool {
507        matches!(tag, self::textarea | self::title)
508    }
509
510    /// Whether an element is considered metadata.
511    pub fn is_metadata(tag: HtmlTag) -> bool {
512        matches!(
513            tag,
514            self::base
515                | self::link
516                | self::meta
517                | self::noscript
518                | self::script
519                | self::style
520                | self::template
521                | self::title
522        )
523    }
524
525    /// Whether nodes with the tag have the CSS property `display: block` by
526    /// default.
527    pub fn is_block_by_default(tag: HtmlTag) -> bool {
528        matches!(
529            tag,
530            self::html
531                | self::head
532                | self::body
533                | self::article
534                | self::aside
535                | self::h1
536                | self::h2
537                | self::h3
538                | self::h4
539                | self::h5
540                | self::h6
541                | self::hgroup
542                | self::nav
543                | self::section
544                | self::dd
545                | self::dl
546                | self::dt
547                | self::menu
548                | self::ol
549                | self::ul
550                | self::address
551                | self::blockquote
552                | self::dialog
553                | self::div
554                | self::fieldset
555                | self::figure
556                | self::figcaption
557                | self::footer
558                | self::form
559                | self::header
560                | self::hr
561                | self::legend
562                | self::main
563                | self::p
564                | self::pre
565                | self::search
566        )
567    }
568
569    /// Whether the element is inline-level as opposed to being block-level.
570    ///
571    /// Not sure whether this distinction really makes sense. But we somehow
572    /// need to decide what to put into automatic paragraphs. A `<strong>`
573    /// should merged into a paragraph created by realization, but a `<div>`
574    /// shouldn't.
575    ///
576    /// <https://www.w3.org/TR/html401/struct/global.html#block-inline>
577    /// <https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content>
578    /// <https://github.com/orgs/mdn/discussions/353>
579    pub fn is_inline_by_default(tag: HtmlTag) -> bool {
580        matches!(
581            tag,
582            self::abbr
583                | self::a
584                | self::bdi
585                | self::b
586                | self::br
587                | self::bdo
588                | self::code
589                | self::cite
590                | self::dfn
591                | self::data
592                | self::i
593                | self::em
594                | self::mark
595                | self::kbd
596                | self::rp
597                | self::q
598                | self::ruby
599                | self::rt
600                | self::samp
601                | self::s
602                | self::span
603                | self::small
604                | self::sub
605                | self::strong
606                | self::time
607                | self::sup
608                | self::var
609                | self::u
610        )
611    }
612
613    /// Whether nodes with the tag have the CSS property `display: table(-.*)?`
614    /// by default.
615    pub fn is_tabular_by_default(tag: HtmlTag) -> bool {
616        matches!(
617            tag,
618            self::table
619                | self::thead
620                | self::tbody
621                | self::tfoot
622                | self::tr
623                | self::th
624                | self::td
625                | self::caption
626                | self::col
627                | self::colgroup
628        )
629    }
630}
631
632/// Predefined constants for HTML attributes.
633///
634/// Note: These are very incomplete.
635#[allow(non_upper_case_globals)]
636pub mod attr {
637    use super::HtmlAttr;
638
639    macro_rules! attrs {
640        ($($attr:ident)*) => {
641            $(#[allow(non_upper_case_globals)]
642            pub const $attr: HtmlAttr = HtmlAttr::constant(
643                stringify!($attr)
644            );)*
645        }
646    }
647
648    attrs! {
649        charset
650        cite
651        colspan
652        content
653        href
654        name
655        reversed
656        role
657        rowspan
658        start
659        style
660        value
661    }
662
663    pub const aria_level: HtmlAttr = HtmlAttr::constant("aria-level");
664}