lol_html/parser/tree_builder_simulator/
ambiguity_guard.rs

//! There are a few ambiguous cases where we can't determine the correct
//! parsing context, having only limited information about the current
//! state of the tree builder. This caused issues in the past where
4//! Cloudflare's security features were used as XSS gadgets
5//! (see <https://portswigger.net/blog/when-security-features-collide>).
6//! Therefore, due to these safety concerns in such cases we prefer
7//! to bail out from tokenization process.
8//!
9//! In tree builder simulation we need to switch parser to one
10//! of standalone text parsing state machines if we encounter some
11//! specific tags. E.g. if we encounter `<script>` start tag we should
12//! treat all content up to the closing `</script>` tag as text.
//! Without having a full-featured tree construction stage there is a way
//! to trick the parser into parsing content that has actual tags in it
15//! as text. E.g. by putting `<script>` start tag into context where
16//! it will be ignored.
17//!
18//! There are just a few tree builder insertion modes in which text
19//! parsing mode switching start tags can be ignored: in `<select>` and in
20//! or after `<frameset>`.
21//!
22//! There are numerous not so obvious ways to get into or get out of these
23//! insertion modes. So, for safety reasons we try to be pro-active here
24//! and just bailout in case if we see text parsing mode switching start tags
25//! between `<select>` start and end tag, or anywhere after the `<frameset>`
26//! start tag. These cases shouldn't trigger bailout for any *conforming*
27//! markup.
28//!
29//! However, there is a case where bailout could happen even with conforming
30//! markup: if we encounter text parsing mode switching start tag in `<template>`
31//! which is inside `<select>` element content. Unfortunately, rules required
//! to track template parsing context are way too complicated in such a case
33//! and will require an implementation of the significant part of the tree
34//! construction state. Though, current assumption is that markup that can
35//! trigger this bailout case should be seen quite rarely in the wild.
36use crate::html::{LocalNameHash, Tag};
37use std::fmt::{self, Display};
38use thiserror::Error;
39
40/// An error that occurs when HTML parser runs into an ambigious state in the [`strict`] mode.
41///
42/// Since the rewriter operates on a token stream and doesn't have access to a full
43/// DOM-tree, there are certain rare cases of non-conforming HTML markup which can't be
44/// guaranteed to be parsed correctly without an ability to backtrace the tree.
45///
46/// Therefore, due to security considerations, sometimes it's preferable to abort the
47/// rewriting process in case of such uncertainty.
48///
49/// One of the simplest examples of such markup is the following:
50///
51/// ```html
52/// ...
53/// <select><xmp><script>"use strict";</script></select>
54/// ...
55/// ```
56///
57/// The `<xmp>` element is not allowed inside the `<select>` element, so in a browser the start
58/// tag for `<xmp>` will be ignored and following `<script>` element will be parsed and executed.
59///
60/// On the other hand, the `<select>` element itself can be also ignored depending on the
61/// context in which it was parsed. In this case, the `<xmp>` element will not be ignored
62/// and the `<script>` element along with its content will be parsed as a simple text inside
63/// it.
64///
65/// So, in this case the parser needs an ability to backtrace the DOM-tree to figure out the
66/// correct parsing context.
67///
68/// [`strict`]: ../struct.Settings.html#structfield.strict
69#[derive(Error, Debug, Eq, PartialEq)]
70pub struct ParsingAmbiguityError {
71    on_tag_name: Box<str>,
72}
73
74impl Display for ParsingAmbiguityError {
75    #[cold]
76    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
77        write!(
78            f,
79            concat!(
80            "The parser has encountered a text content tag (`<{}>`) in the context where it is ",
81            "ambiguous whether this tag should be ignored or not. And, thus, is is unclear is ",
82            "consequent content should be parsed as raw text or HTML markup.",
83            "\n\n",
84            "This error occurs due to the limited capabilities of the streaming parsing. However, ",
85            "almost all of the cases of this error are caused by a non-conforming markup (e.g. a ",
86            "`<script>` element in `<select>` element)."
87        ),
88            self.on_tag_name
89        )
90    }
91}
92
93// NOTE: use macro for the assertion function definition, so we can
94// provide ambiguity error with a string representation of the tag
95// name without a necessity to implement conversion from u64 tag name
96// hash to a string. This also allows us to be consistent about asserted
97// tag name hashes and the corresponding tag name strings.
98macro_rules! create_assert_for_tags {
99    ( $($tag:ident),+ ) => {
100        #[cold]
101        fn tag_hash_to_string(tag_name: LocalNameHash) -> Box<str> {
102            let s = match tag_name {
103                $(t if t == Tag::$tag => stringify!($tag),)+
104                _ => "no string representation",
105            };
106            s.to_ascii_lowercase().into_boxed_str()
107        }
108
109        #[inline]
110        fn assert_not_ambigious_text_type_switch(
111            tag_name: LocalNameHash,
112        ) -> Result<(), ParsingAmbiguityError> {
113            if tag_is_one_of!(tag_name, [ $($tag),+ ]) {
114                Err(ParsingAmbiguityError {
115                    on_tag_name: tag_hash_to_string(tag_name)
116                })
117            } else {
118                Ok(())
119            }
120        }
121    };
122}
123
124create_assert_for_tags!(
125    Textarea, Title, Plaintext, Script, Style, Iframe, Xmp, Noembed, Noframes, Noscript
126);
127
/// Parsing contexts tracked by the ambiguity guard.
#[derive(Clone, Copy)]
enum State {
    /// Neither inside `<select>` nor in or after `<frameset>`.
    Default,
    /// Between a `<select>` start tag and the tag that ends the
    /// "in select" context.
    InSelect,
    /// Inside one or more `<template>` elements opened within `<select>`;
    /// the payload is the current `<template>` nesting depth.
    InTemplateInSelect(u64),
    /// A `<frameset>` start tag has been seen.
    InOrAfterFrameset,
}
135
136pub(crate) struct AmbiguityGuard {
137    state: State,
138}
139
140impl Default for AmbiguityGuard {
141    fn default() -> Self {
142        Self {
143            state: State::Default,
144        }
145    }
146}
147
148impl AmbiguityGuard {
149    pub fn track_start_tag(
150        &mut self,
151        tag_name: LocalNameHash,
152    ) -> Result<(), ParsingAmbiguityError> {
153        match self.state {
154            State::Default => {
155                if tag_name == Tag::Select {
156                    self.state = State::InSelect;
157                } else if tag_name == Tag::Frameset {
158                    self.state = State::InOrAfterFrameset;
159                }
160            }
161            State::InSelect => {
162                // NOTE: these start tags cause premature exit
163                // from "in select" insertion mode.
164                if tag_is_one_of!(tag_name, [Select, Textarea, Input, Keygen]) {
165                    self.state = State::Default;
166                } else if tag_name == Tag::Template {
167                    self.state = State::InTemplateInSelect(1);
168                }
169                // NOTE: <script> is allowed in "in select" insertion mode.
170                else if tag_name != Tag::Script {
171                    assert_not_ambigious_text_type_switch(tag_name)?;
172                }
173            }
174            State::InTemplateInSelect(depth) => {
175                if tag_name == Tag::Template {
176                    self.state = State::InTemplateInSelect(depth + 1);
177                } else {
178                    assert_not_ambigious_text_type_switch(tag_name)?;
179                }
180            }
181            State::InOrAfterFrameset => {
182                // NOTE: <noframes> is allowed in and after <frameset>.
183                if tag_name != Tag::Noframes {
184                    assert_not_ambigious_text_type_switch(tag_name)?;
185                }
186            }
187        }
188
189        Ok(())
190    }
191
192    pub fn track_end_tag(&mut self, tag_name: LocalNameHash) {
193        match self.state {
194            State::InSelect if tag_name == Tag::Select => {
195                self.state = State::Default;
196            }
197            State::InTemplateInSelect(depth) if tag_name == Tag::Template => {
198                self.state = if depth == 1 {
199                    State::InSelect
200                } else {
201                    State::InTemplateInSelect(depth - 1)
202                }
203            }
204            _ => (),
205        }
206    }
207}