css_inline/html/
serializer.rs

1use super::{
2    attributes::Attributes,
3    document::Document,
4    node::{ElementData, NodeData, NodeId},
5    DocumentStyleMap, InliningMode,
6};
7use crate::{html::ElementStyleMap, parser, InlineError};
8use html5ever::{local_name, namespace_url, ns, tendril::StrTendril, LocalName, QualName};
9use smallvec::{smallvec, SmallVec};
10use std::io::Write;
11
12pub(crate) fn serialize_to<W: Write>(
13    document: &Document,
14    writer: &mut W,
15    styles: DocumentStyleMap<'_>,
16    keep_style_tags: bool,
17    keep_link_tags: bool,
18    mode: InliningMode,
19) -> Result<(), InlineError> {
20    let sink = Sink::new(
21        document,
22        NodeId::document_id(),
23        keep_style_tags,
24        keep_link_tags,
25        mode,
26    );
27    let mut ser = HtmlSerializer::new(writer, styles);
28    sink.serialize(&mut ser)
29}
30
31/// Intermediary structure for serializing an HTML document.
32struct Sink<'a> {
33    document: &'a Document,
34    node: NodeId,
35    keep_style_tags: bool,
36    keep_link_tags: bool,
37    inlining_mode: InliningMode,
38}
39
40impl<'a> Sink<'a> {
41    fn new(
42        document: &'a Document,
43        node: NodeId,
44        keep_style_tags: bool,
45        keep_link_tags: bool,
46        inlining_mode: InliningMode,
47    ) -> Sink<'a> {
48        Sink {
49            document,
50            node,
51            keep_style_tags,
52            keep_link_tags,
53            inlining_mode,
54        }
55    }
56    #[inline]
57    fn for_node(&self, node: NodeId) -> Sink<'a> {
58        Sink::new(
59            self.document,
60            node,
61            self.keep_style_tags,
62            self.keep_link_tags,
63            self.inlining_mode,
64        )
65    }
66    #[inline]
67    fn data(&self) -> &NodeData {
68        &self.document[self.node].data
69    }
70    #[inline]
71    fn should_skip_element(&self, element: &ElementData) -> bool {
72        if element.name.local == local_name!("style") {
73            !self.keep_style_tags
74                && element.attributes.get("data-css-inline".into()) != Some("keep")
75        } else if element.name.local == local_name!("link")
76            && element.attributes.get(local_name!("rel")) == Some("stylesheet")
77        {
78            !self.keep_link_tags
79        } else if element.name.local == local_name!("html") {
80            matches!(self.inlining_mode, InliningMode::Fragment)
81        } else {
82            false
83        }
84    }
85
86    fn serialize_children<W: Write>(
87        &self,
88        serializer: &mut HtmlSerializer<'_, W>,
89    ) -> Result<(), InlineError> {
90        for child in self.document.children(self.node) {
91            self.for_node(child).serialize(serializer)?;
92        }
93        Ok(())
94    }
95
96    fn serialize<W: Write>(
97        &self,
98        serializer: &mut HtmlSerializer<'_, W>,
99    ) -> Result<(), InlineError> {
100        match self.data() {
101            NodeData::Element {
102                element,
103                inlining_ignored,
104            } => {
105                if self.should_skip_element(element) {
106                    return Ok(());
107                }
108
109                let style_node_id = if *inlining_ignored {
110                    None
111                } else {
112                    Some(self.node)
113                };
114
115                serializer.start_elem(&element.name, &element.attributes, style_node_id)?;
116
117                self.serialize_children(serializer)?;
118
119                serializer.end_elem(&element.name)?;
120                Ok(())
121            }
122            NodeData::Document => self.serialize_children(serializer),
123            NodeData::Doctype { name } => serializer.write_doctype(name),
124            NodeData::Text { text: content } => serializer.write_text(content),
125            NodeData::Comment { text } => serializer.write_comment(text),
126            NodeData::ProcessingInstruction { target, data } => {
127                serializer.write_processing_instruction(target, data)
128            }
129        }
130    }
131}
132
133struct ElemInfo {
134    html_name: Option<LocalName>,
135    ignore_children: bool,
136}
137
138/// Inspired by HTML serializer from `html5ever`
139/// Source: <https://github.com/servo/html5ever/blob/98d3c0cd01471af997cd60849a38da45a9414dfd/html5ever/src/serialize/mod.rs#L77>
140struct HtmlSerializer<'a, Wr: Write> {
141    writer: Wr,
142    styles: DocumentStyleMap<'a>,
143    stack: Vec<ElemInfo>,
144    style_buffer: SmallVec<[Vec<u8>; 8]>,
145}
146
147impl<'a, W: Write> HtmlSerializer<'a, W> {
148    fn new(writer: W, styles: DocumentStyleMap<'a>) -> Self {
149        let mut stack = Vec::with_capacity(8);
150        stack.push(ElemInfo {
151            html_name: None,
152            ignore_children: false,
153        });
154        HtmlSerializer {
155            writer,
156            styles,
157            stack,
158            style_buffer: smallvec![],
159        }
160    }
161
162    fn parent(&mut self) -> &mut ElemInfo {
163        self.stack.last_mut().expect("no parent ElemInfo")
164    }
165
166    fn write_escaped(&mut self, text: &str) -> Result<(), InlineError> {
167        let mut last_end = 0;
168        for (start, part) in text.match_indices(['&', '\u{00A0}', '<', '>']) {
169            self.writer.write_all(
170                text.get(last_end..start)
171                    .expect("Invalid substring")
172                    .as_bytes(),
173            )?;
174            // This is slightly faster than matching on `char`
175            // Notably, this approach does not work in `write_attributes` below
176            match (part.as_bytes()[0] & 0b0000_1110) >> 1 {
177                1 => self.writer.write_all(b"&nbsp;")?,
178                3 => self.writer.write_all(b"&amp;")?,
179                6 => self.writer.write_all(b"&lt;")?,
180                7 => self.writer.write_all(b"&gt;")?,
181                _ => unreachable!(),
182            }
183            last_end = start.checked_add(part.len()).expect("Size overflow");
184        }
185        self.writer.write_all(
186            text.get(last_end..text.len())
187                .expect("Invalid substring")
188                .as_bytes(),
189        )?;
190        Ok(())
191    }
192
193    fn write_attributes(&mut self, text: &str) -> Result<(), InlineError> {
194        let mut last_end = 0;
195        for (start, part) in text.match_indices(['&', '\u{00A0}', '"']) {
196            self.writer.write_all(
197                text.get(last_end..start)
198                    .expect("Invalid substring")
199                    .as_bytes(),
200            )?;
201            match part {
202                "&" => self.writer.write_all(b"&amp;")?,
203                "\u{00A0}" => self.writer.write_all(b"&nbsp;")?,
204                "\"" => self.writer.write_all(b"&quot;")?,
205                _ => unreachable!("Only the variants above are searched"),
206            }
207            last_end = start.checked_add(part.len()).expect("Size overflow");
208        }
209        self.writer.write_all(
210            text.get(last_end..text.len())
211                .expect("Invalid substring")
212                .as_bytes(),
213        )?;
214        Ok(())
215    }
216
217    fn start_elem(
218        &mut self,
219        name: &QualName,
220        attrs: &Attributes,
221        style_node_id: Option<NodeId>,
222    ) -> Result<(), InlineError> {
223        let html_name = match name.ns {
224            ns!(html) => Some(name.local.clone()),
225            _ => None,
226        };
227
228        if self.parent().ignore_children {
229            self.stack.push(ElemInfo {
230                html_name,
231                ignore_children: true,
232            });
233            return Ok(());
234        }
235
236        let mut styles = if let Some(node_id) = style_node_id {
237            self.styles.swap_remove(&node_id).map(|mut styles| {
238                styles.sort_unstable_by(|_, (a, _), _, (b, _)| a.cmp(b));
239                styles
240            })
241        } else {
242            None
243        };
244
245        self.writer.write_all(b"<")?;
246        self.writer.write_all(name.local.as_bytes())?;
247        if let Some(class) = &attrs.class {
248            self.writer.write_all(b" class=\"")?;
249            self.writer.write_all(class.value.as_bytes())?;
250            self.writer.write_all(b"\"")?;
251        }
252        for attr in &attrs.attributes {
253            self.writer.write_all(b" ")?;
254
255            match attr.name.ns {
256                ns!() => (),
257                ns!(xml) => self.writer.write_all(b"xml:")?,
258                ns!(xmlns) => {
259                    if attr.name.local != local_name!("xmlns") {
260                        self.writer.write_all(b"xmlns:")?;
261                    }
262                }
263                ns!(xlink) => self.writer.write_all(b"xlink:")?,
264                _ => {
265                    self.writer.write_all(b"unknown_namespace:")?;
266                }
267            }
268
269            self.writer.write_all(attr.name.local.as_bytes())?;
270            self.writer.write_all(b"=\"")?;
271            if attr.name.local.as_bytes() == b"style" {
272                if let Some(new_styles) = &styles {
273                    merge_styles(
274                        &mut self.writer,
275                        &attr.value,
276                        new_styles,
277                        &mut self.style_buffer,
278                    )?;
279                    styles = None;
280                } else {
281                    self.write_attributes(&attr.value)?;
282                }
283            } else {
284                self.write_attributes(&attr.value)?;
285            }
286            self.writer.write_all(b"\"")?;
287        }
288        if let Some(styles) = &styles {
289            self.writer.write_all(b" style=\"")?;
290            for (property, (_, value)) in styles {
291                write_declaration(&mut self.writer, property, value)?;
292                self.writer.write_all(b";")?;
293            }
294            self.writer.write_all(b"\"")?;
295        }
296        self.writer.write_all(b">")?;
297
298        let ignore_children = name.ns == ns!(html)
299            && matches!(
300                name.local,
301                local_name!("area")
302                    | local_name!("base")
303                    | local_name!("basefont")
304                    | local_name!("bgsound")
305                    | local_name!("br")
306                    | local_name!("col")
307                    | local_name!("embed")
308                    | local_name!("frame")
309                    | local_name!("hr")
310                    | local_name!("img")
311                    | local_name!("input")
312                    | local_name!("keygen")
313                    | local_name!("link")
314                    | local_name!("meta")
315                    | local_name!("param")
316                    | local_name!("source")
317                    | local_name!("track")
318                    | local_name!("wbr")
319            );
320
321        self.stack.push(ElemInfo {
322            html_name,
323            ignore_children,
324        });
325
326        Ok(())
327    }
328
329    fn end_elem(&mut self, name: &QualName) -> Result<(), InlineError> {
330        let Some(info) = self.stack.pop() else {
331            panic!("no ElemInfo")
332        };
333        if info.ignore_children {
334            return Ok(());
335        }
336
337        self.writer.write_all(b"</")?;
338        self.writer.write_all(name.local.as_bytes())?;
339        self.writer.write_all(b">")?;
340        Ok(())
341    }
342
343    fn write_text(&mut self, text: &str) -> Result<(), InlineError> {
344        let escape = !matches!(
345            self.parent().html_name,
346            Some(
347                local_name!("style")
348                    | local_name!("script")
349                    | local_name!("xmp")
350                    | local_name!("iframe")
351                    | local_name!("noembed")
352                    | local_name!("noframes")
353                    | local_name!("plaintext")
354                    | local_name!("noscript")
355            ),
356        );
357
358        if escape {
359            self.write_escaped(text)?;
360        } else {
361            self.writer.write_all(text.as_bytes())?;
362        }
363        Ok(())
364    }
365
366    fn write_comment(&mut self, text: &str) -> Result<(), InlineError> {
367        self.writer.write_all(b"<!--")?;
368        self.writer.write_all(text.as_bytes())?;
369        self.writer.write_all(b"-->")?;
370        Ok(())
371    }
372
373    fn write_doctype(&mut self, name: &str) -> Result<(), InlineError> {
374        self.writer.write_all(b"<!DOCTYPE ")?;
375        self.writer.write_all(name.as_bytes())?;
376        self.writer.write_all(b">")?;
377        Ok(())
378    }
379
380    fn write_processing_instruction(
381        &mut self,
382        target: &str,
383        data: &str,
384    ) -> Result<(), InlineError> {
385        self.writer.write_all(b"<?")?;
386        self.writer.write_all(target.as_bytes())?;
387        self.writer.write_all(b" ")?;
388        self.writer.write_all(data.as_bytes())?;
389        self.writer.write_all(b">")?;
390        Ok(())
391    }
392}
393
/// Bytes written between a property name and its value in a serialized declaration.
const STYLE_SEPARATOR: &[u8] = b": ";
395
396#[inline]
397fn write_declaration<Wr: Write>(
398    writer: &mut Wr,
399    name: &str,
400    value: &str,
401) -> Result<(), InlineError> {
402    writer.write_all(name.as_bytes())?;
403    writer.write_all(STYLE_SEPARATOR)?;
404    write_declaration_value(writer, value)
405}
406
407#[inline]
408fn write_declaration_value<Wr: Write>(writer: &mut Wr, value: &str) -> Result<(), InlineError> {
409    let value = value.trim();
410    if value.as_bytes().contains(&b'"') {
411        // Roughly based on `str::replace`
412        let mut last_end = 0;
413        for (start, part) in value.match_indices('"') {
414            writer.write_all(
415                value
416                    .get(last_end..start)
417                    .expect("Invalid substring")
418                    .as_bytes(),
419            )?;
420            writer.write_all(b"'")?;
421            last_end = start.checked_add(part.len()).expect("Size overflow");
422        }
423        writer.write_all(
424            value
425                .get(last_end..value.len())
426                .expect("Invalid substring")
427                .as_bytes(),
428        )?;
429    } else {
430        writer.write_all(value.as_bytes())?;
431    }
432    Ok(())
433}
434
// Store the declaration `$name: $value` in slot `$length` of `$style_buffer` and bump
// `$length` by one. An existing slot is cleared and reused; otherwise a new buffer is
// allocated and pushed. Must be used inside a function returning `Result`, as it uses `?`.
macro_rules! push_or_update {
    ($style_buffer:expr, $length:expr, $name: expr, $value:expr) => {{
        match $style_buffer.get_mut($length) {
            Some(slot) => {
                slot.clear();
                write_declaration(slot, &$name, $value)?;
            }
            None => {
                let capacity = $name
                    .len()
                    .saturating_add(STYLE_SEPARATOR.len())
                    .saturating_add($value.trim().len());
                let mut slot = Vec::with_capacity(capacity);
                write_declaration(&mut slot, &$name, $value)?;
                $style_buffer.push(slot);
            }
        }
        $length = $length.saturating_add(1);
    }};
}
454
455/// Merge a new set of styles into an current one, considering the rules of CSS precedence.
456///
457/// The merge process maintains the order of specificity and respects the `!important` rule in CSS.
458fn merge_styles<Wr: Write>(
459    writer: &mut Wr,
460    current_style: &StrTendril,
461    new_styles: &ElementStyleMap<'_>,
462    declarations_buffer: &mut SmallVec<[Vec<u8>; 8]>,
463) -> Result<(), InlineError> {
464    // This function is designed with a focus on reusing existing allocations where possible
465    // We start by parsing the current declarations in the "style" attribute
466    let mut parser_input = cssparser::ParserInput::new(current_style);
467    let mut parser = cssparser::Parser::new(&mut parser_input);
468    let mut declaration_parser = parser::CSSDeclarationListParser;
469    let current_declarations = cssparser::RuleBodyParser::new(&mut parser, &mut declaration_parser);
470    // We manually manage the length of our buffer. The buffer may contain slots used
471    // in previous runs, and we want to access only the portion that we build in this iteration
472    let mut parsed_declarations_count: usize = 0;
473    for (idx, declaration) in current_declarations.enumerate() {
474        parsed_declarations_count = parsed_declarations_count.saturating_add(1);
475        let (property, value) = declaration?;
476        let estimated_declaration_size = property
477            .len()
478            .saturating_add(STYLE_SEPARATOR.len())
479            .saturating_add(value.len());
480        // We store the existing style declarations in the buffer for later merging with new styles
481        // If possible, we reuse existing slots in the buffer to avoid additional allocations
482        if let Some(buffer) = declarations_buffer.get_mut(idx) {
483            buffer.clear();
484            buffer.reserve(estimated_declaration_size);
485            write_declaration(buffer, &property, value)?;
486        } else {
487            let mut buffer = Vec::with_capacity(estimated_declaration_size);
488            write_declaration(&mut buffer, &property, value)?;
489            declarations_buffer.push(buffer);
490        }
491    }
492    // Keep the number of current declarations to write them last as they have the precedence
493    let current_declarations_count = parsed_declarations_count;
494    // Next, we iterate over the new styles and merge them into our existing set
495    // New rules will not override old ones unless they are marked as `!important`
496    for (property, (_, value)) in new_styles {
497        match (
498            value.strip_suffix("!important"),
499            declarations_buffer
500                .iter_mut()
501                .take(parsed_declarations_count)
502                .find(|style| {
503                    style.starts_with(property.as_bytes())
504                        && style.get(property.len()..=property.len().saturating_add(1))
505                            == Some(STYLE_SEPARATOR)
506                }),
507        ) {
508            // The new rule is `!important` and there's an existing rule with the same name
509            // In this case, we override the existing rule with the new one
510            (Some(value), Some(buffer)) => {
511                // We keep the rule name and the colon-space suffix - '<rule>: `
512                buffer.truncate(property.len().saturating_add(STYLE_SEPARATOR.len()));
513                write_declaration_value(buffer, value)?;
514            }
515            // There's no existing rule with the same name, but the new rule is `!important`
516            // In this case, we add the new rule with the `!important` suffix removed
517            (Some(value), None) => {
518                push_or_update!(
519                    declarations_buffer,
520                    parsed_declarations_count,
521                    property,
522                    value
523                );
524            }
525            // There's no existing rule with the same name, and the new rule is not `!important`
526            // In this case, we just add the new rule as-is
527            (None, None) => push_or_update!(
528                declarations_buffer,
529                parsed_declarations_count,
530                property,
531                value
532            ),
533            // Rule exists and the new one is not `!important` - leave the existing rule as-is and
534            // ignore the new one.
535            (None, Some(_)) => {}
536        }
537    }
538
539    let mut first = true;
540    for range in [
541        // First, write the new rules
542        current_declarations_count..parsed_declarations_count,
543        // Then, write the current rules
544        0..current_declarations_count,
545    ] {
546        for declaration in &declarations_buffer[range] {
547            if first {
548                first = false;
549            } else {
550                writer.write_all(b";")?;
551            }
552            writer.write_all(declaration)?;
553        }
554    }
555    Ok(())
556}
557
#[cfg(test)]
mod tests {
    use crate::html::InliningMode;

    use super::Document;
    use indexmap::IndexMap;

    /// Parse `html` in document mode and serialize it back with an empty style map,
    /// dropping `link` tags and keeping `style` tags only when `keep_style_tags` is set.
    fn roundtrip(html: &[u8], keep_style_tags: bool) -> Vec<u8> {
        let doc = Document::parse_with_options(html, 0, InliningMode::Document);
        let mut buffer = Vec::new();
        doc.serialize(
            &mut buffer,
            IndexMap::default(),
            keep_style_tags,
            false,
            InliningMode::Document,
        )
        .expect("Should not fail");
        buffer
    }

    #[test]
    fn test_serialize() {
        let buffer = roundtrip(
            b"<html><head><style>h1 { color:blue; }</style><style>h1 { color:red }</style></head>",
            true,
        );
        assert_eq!(buffer, b"<html><head><style>h1 { color:blue; }</style><style>h1 { color:red }</style></head><body></body></html>");
    }

    #[test]
    fn test_skip_style_tags() {
        let buffer = roundtrip(
            b"<html><head><style>h1 { color:blue; }</style><style>h1 { color:red }</style></head>",
            false,
        );
        assert_eq!(buffer, b"<html><head></head><body></body></html>");
    }

    #[test]
    fn test_escaped() {
        let buffer = roundtrip(
            b"<!DOCTYPE html><html><head><title>& < > \xC2\xA0</title></head><body></body></html>",
            false,
        );
        assert_eq!(buffer, b"<!DOCTYPE html><html><head><title>&amp; &lt; &gt; &nbsp;</title></head><body></body></html>");
    }

    #[test]
    fn test_untouched_style() {
        let buffer = roundtrip(
            b"<html><body><p style=\"color:blue;\"></p></body></html>",
            false,
        );
        assert_eq!(
            buffer,
            b"<html><head></head><body><p style=\"color:blue;\"></p></body></html>"
        );
    }

    #[test]
    fn test_attributes() {
        let buffer = roundtrip(
            b"<!DOCTYPE html><html><head></head><body data-foo='& \xC2\xA0 \"'></body></html>",
            false,
        );
        assert_eq!(buffer, b"<!DOCTYPE html><html><head></head><body data-foo=\"&amp; &nbsp; &quot;\"></body></html>");
    }
}