// Source file: typst_html/encode.rs

1use std::fmt::Write;
2
3use comemo::{Track, Tracked};
4use ecow::{EcoString, eco_format};
5use typst_library::diag::{At, SourceResult, StrResult, bail};
6use typst_library::foundations::Repr;
7use typst_library::model::LateLinkResolver;
8use typst_syntax::Span;
9
10use crate::{
11    HtmlDocument, HtmlElement, HtmlFrame, HtmlNode, HtmlTag, attr, charsets, property,
12    tag,
13};
14
/// Configuration for exporting to HTML.
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)]
pub struct HtmlOptions {
    /// When `true`, the encoder inserts newlines and indentation so that the
    /// resulting markup is easier for humans to read.
    pub pretty: bool,
}
21
22/// Encodes an HTML document into a string.
23pub fn html(document: &HtmlDocument, options: &HtmlOptions) -> SourceResult<String> {
24    let link_resolver = LateLinkResolver::new(None, document.introspector().as_ref());
25    let w = Writer::new(link_resolver.track(), options.pretty);
26    html_impl(w, document.root())
27}
28
29/// Encodes an HTML root element into a string as part of a bundle.
30///
31/// See `export_html` in `typst-bundle` for more details on why this takes the
32/// root element instead of the document.
33pub fn html_in_bundle(
34    root: &HtmlElement,
35    options: &HtmlOptions,
36    link_resolver: Tracked<LateLinkResolver>,
37) -> SourceResult<String> {
38    let w = Writer::new(link_resolver, options.pretty);
39    html_impl(w, root)
40}
41
42/// The shared implementation of [`html`] and [`html_in_bundle`].
43fn html_impl(mut w: Writer, root: &HtmlElement) -> SourceResult<String> {
44    w.buf.push_str("<!DOCTYPE html>");
45    write_indent(&mut w);
46    write_element(&mut w, root)?;
47    if w.pretty {
48        w.buf.push('\n');
49    }
50    Ok(w.buf)
51}
52
53/// Encodes HTML.
54struct Writer<'a> {
55    /// The output buffer.
56    buf: String,
57    /// The current indentation level
58    level: usize,
59    /// Used to resolve links between the document and contained frames as well
60    /// as cross-document links in bundle export.
61    link_resolver: Tracked<'a, LateLinkResolver<'a>>,
62    /// Whether pretty printing is enabled.
63    pretty: bool,
64}
65
66impl<'a> Writer<'a> {
67    /// Creates a new writer.
68    fn new(link_resolver: Tracked<'a, LateLinkResolver<'a>>, pretty: bool) -> Self {
69        Self {
70            buf: String::new(),
71            level: 0,
72            link_resolver,
73            pretty,
74        }
75    }
76}
77
78/// Writes a newline and indent, if pretty printing is enabled.
79fn write_indent(w: &mut Writer) {
80    if w.pretty {
81        w.buf.push('\n');
82        for _ in 0..w.level {
83            w.buf.push_str("  ");
84        }
85    }
86}
87
88/// Encodes an HTML node into the writer.
89fn write_node(w: &mut Writer, node: &HtmlNode, escape_text: bool) -> SourceResult<()> {
90    match node {
91        HtmlNode::Tag(_) => {}
92        HtmlNode::Text(text, span) => write_text(w, text, *span, escape_text)?,
93        HtmlNode::Element(element) => write_element(w, element)?,
94        HtmlNode::Frame(frame) => write_frame(w, frame),
95    }
96    Ok(())
97}
98
99/// Encodes plain text into the writer.
100fn write_text(w: &mut Writer, text: &str, span: Span, escape: bool) -> SourceResult<()> {
101    for c in text.chars() {
102        if escape || !charsets::is_valid_in_normal_element_text(c) {
103            write_escape(w, c).at(span)?;
104        } else {
105            w.buf.push(c);
106        }
107    }
108    Ok(())
109}
110
111/// Encodes one element into the writer.
112fn write_element(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
113    w.buf.push('<');
114    w.buf.push_str(&element.tag.resolve());
115
116    for (attr, value) in &element.attrs.0 {
117        w.buf.push(' ');
118        w.buf.push_str(&attr.resolve());
119
120        // If the string is empty, we can use shorthand syntax.
121        // `<elem attr="">..</div` is equivalent to `<elem attr>..</div>`
122        if !value.is_empty() {
123            w.buf.push('=');
124            w.buf.push('"');
125            for c in value.chars() {
126                if charsets::is_valid_in_attribute_value(c) {
127                    w.buf.push(c);
128                } else {
129                    write_escape(w, c).at(element.span)?;
130                }
131            }
132            w.buf.push('"');
133        }
134    }
135
136    if tag::is_foreign_self_closing(element.tag) {
137        w.buf.push('/');
138    }
139
140    w.buf.push('>');
141
142    if tag::is_void(element.tag) || tag::is_foreign_self_closing(element.tag) {
143        if !element.children.is_empty() {
144            bail!(element.span, "HTML void elements must not have children");
145        }
146        return Ok(());
147    }
148
149    // See HTML spec § 13.1.2.5.
150    if matches!(element.tag, tag::pre | tag::textarea) && starts_with_newline(element) {
151        w.buf.push('\n');
152    }
153
154    if tag::is_raw(element.tag) {
155        write_raw(w, element)?;
156    } else if tag::is_escapable_raw(element.tag) {
157        write_escapable_raw(w, element)?;
158    } else if !element.children.is_empty() {
159        write_children(w, element)?;
160    }
161
162    w.buf.push_str("</");
163    w.buf.push_str(&element.tag.resolve());
164    w.buf.push('>');
165
166    Ok(())
167}
168
169/// Encodes the children of an element.
170fn write_children(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
171    let pretty = w.pretty;
172    let pretty_inside = allows_pretty_inside(element.tag)
173        && element.children.iter().any(|node| match node {
174            HtmlNode::Element(child) => wants_pretty_around(child),
175            HtmlNode::Frame(_) => true,
176            _ => false,
177        });
178
179    w.pretty &= pretty_inside;
180    let mut indent = w.pretty;
181
182    w.level += 1;
183    for c in &element.children {
184        let pretty_around = match c {
185            HtmlNode::Tag(_) => continue,
186            HtmlNode::Element(child) => w.pretty && wants_pretty_around(child),
187            HtmlNode::Text(..) | HtmlNode::Frame(_) => false,
188        };
189
190        if core::mem::take(&mut indent) || pretty_around {
191            write_indent(w);
192        }
193        write_node(w, c, element.pre_span)?;
194        indent = pretty_around;
195    }
196    w.level -= 1;
197
198    write_indent(w);
199    w.pretty = pretty;
200
201    Ok(())
202}
203
204/// Whether the first character in the element is a newline.
205fn starts_with_newline(element: &HtmlElement) -> bool {
206    for child in &element.children {
207        match child {
208            HtmlNode::Tag(_) => {}
209            HtmlNode::Text(text, _) => return text.starts_with(['\n', '\r']),
210            _ => return false,
211        }
212    }
213    false
214}
215
216/// Encodes the contents of a raw text element.
217fn write_raw(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
218    let text = collect_raw_text(element)?;
219
220    if let Some(closing) = find_closing_tag(&text, element.tag) {
221        bail!(
222            element.span,
223            "HTML raw text element cannot contain its own closing tag";
224            hint: "the sequence `{closing}` appears in the raw text";
225        )
226    }
227
228    let mode = if w.pretty { RawMode::of(element, &text) } else { RawMode::Keep };
229    match mode {
230        RawMode::Keep => {
231            w.buf.push_str(&text);
232        }
233        RawMode::Wrap => {
234            w.buf.push('\n');
235            w.buf.push_str(&text);
236            write_indent(w);
237        }
238        RawMode::Indent => {
239            w.level += 1;
240            for line in text.lines() {
241                write_indent(w);
242                w.buf.push_str(line);
243            }
244            w.level -= 1;
245            write_indent(w);
246        }
247    }
248
249    Ok(())
250}
251
252/// Encodes the contents of an escapable raw text element.
253fn write_escapable_raw(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
254    walk_raw_text(element, |piece, span| write_text(w, piece, span, false))
255}
256
257/// Collects the textual contents of a raw text element.
258fn collect_raw_text(element: &HtmlElement) -> SourceResult<String> {
259    let mut text = String::new();
260    walk_raw_text(element, |piece, span| {
261        if let Some(c) = piece.chars().find(|&c| !charsets::is_w3c_text_char(c)) {
262            return Err(unencodable(c)).at(span);
263        }
264        text.push_str(piece);
265        Ok(())
266    })?;
267    Ok(text)
268}
269
270/// Iterates over the textual contents of a raw text element.
271fn walk_raw_text(
272    element: &HtmlElement,
273    mut f: impl FnMut(&str, Span) -> SourceResult<()>,
274) -> SourceResult<()> {
275    for c in &element.children {
276        match c {
277            HtmlNode::Tag(_) => continue,
278            HtmlNode::Text(text, span) => f(text, *span)?,
279            HtmlNode::Element(HtmlElement { span, .. })
280            | HtmlNode::Frame(HtmlFrame { span, .. }) => {
281                bail!(*span, "HTML raw text element cannot have non-text children")
282            }
283        }
284    }
285    Ok(())
286}
287
288/// Finds a closing sequence for the given tag in the text, if it exists.
289///
290/// See HTML spec § 13.1.2.6.
291fn find_closing_tag(text: &str, tag: HtmlTag) -> Option<&str> {
292    let s = tag.resolve();
293    let len = s.len();
294    text.match_indices("</").find_map(|(i, _)| {
295        let rest = &text[i + 2..];
296        let disallowed = rest.len() >= len
297            && rest[..len].eq_ignore_ascii_case(&s)
298            && rest[len..].starts_with(['\t', '\n', '\u{c}', '\r', ' ', '>', '/']);
299        disallowed.then(|| &text[i..i + 2 + len])
300    })
301}
302
303/// How to format the contents of a raw text element.
304enum RawMode {
305    /// Just don't touch it.
306    Keep,
307    /// Newline after the opening and newline + indent before the closing tag.
308    Wrap,
309    /// Newlines after opening and before closing tag and each line indented.
310    Indent,
311}
312
313impl RawMode {
314    fn of(element: &HtmlElement, text: &str) -> Self {
315        match element.tag {
316            tag::script
317                if !element.attrs.0.iter().any(|(attr, value)| {
318                    *attr == attr::r#type && value != "text/javascript"
319                }) =>
320            {
321                // Template literals can be multi-line, so indent may change
322                // the semantics of the JavaScript.
323                if text.contains('`') { Self::Wrap } else { Self::Indent }
324            }
325            tag::style => Self::Indent,
326            _ => Self::Keep,
327        }
328    }
329}
330
331/// Whether we are allowed to add an extra newline at the start and end of the
332/// element's contents.
333///
334/// Technically, users can change CSS `display` properties such that the
335/// insertion of whitespace may actually impact the visual output. For example,
336/// <https://www.w3.org/TR/css-text-3/#example-af2745cd> shows how adding CSS
337/// rules to `<p>` can make it sensitive to whitespace. For this reason, we
338/// should also respect the `style` tag in the future.
339fn allows_pretty_inside(tag: HtmlTag) -> bool {
340    if tag::mathml::is_mathml(tag) && !tag::mathml::is_token(tag) {
341        return true;
342    }
343    let Some(display) = property::Display::default_for(tag) else { return false };
344    (display == property::Display::Block && tag != tag::pre)
345        || display.is_tabular()
346        || display == property::Display::ListItem
347        || tag == tag::head
348}
349
350/// Whether newlines should be added before and after the element if the parent
351/// allows it.
352///
353/// In contrast to `allows_pretty_inside`, which is purely spec-driven, this is
354/// more subjective and depends on preference.
355fn wants_pretty_around(element: &HtmlElement) -> bool {
356    match element.tag {
357        tag::mathml::math => {
358            element.attrs.get(attr::mathml::display).is_some_and(|v| v == "block")
359        }
360        t if tag::mathml::is_mathml(t) => true,
361        tag::pre => true,
362        t if tag::is_metadata_content(t) => true,
363        t => allows_pretty_inside(t),
364    }
365}
366
367/// Escape a character.
368fn write_escape(w: &mut Writer, c: char) -> StrResult<()> {
369    // See <https://html.spec.whatwg.org/multipage/syntax.html#syntax-charref>
370    match c {
371        '&' => w.buf.push_str("&amp;"),
372        '<' => w.buf.push_str("&lt;"),
373        '>' => w.buf.push_str("&gt;"),
374        '"' => w.buf.push_str("&quot;"),
375        '\'' => w.buf.push_str("&apos;"),
376        c if charsets::is_w3c_text_char(c) && c != '\r' => {
377            write!(w.buf, "&#x{:x};", c as u32).unwrap()
378        }
379        _ => return Err(unencodable(c)),
380    }
381    Ok(())
382}
383
384/// The error message for a character that cannot be encoded.
385#[cold]
386fn unencodable(c: char) -> EcoString {
387    eco_format!("the character `{}` cannot be encoded in HTML", c.repr())
388}
389
390/// Encode a laid out frame into the writer.
391fn write_frame(w: &mut Writer, frame: &HtmlFrame) {
392    let svg = typst_svg::svg_in_html(
393        &frame.inner,
394        frame.text_size,
395        w.pretty,
396        frame.id.as_deref(),
397        &eco_format!("{}", frame.css.to_inline()),
398        &frame.anchors,
399        w.link_resolver,
400    );
401    w.buf.push_str(&svg);
402}