lol_html/parser/tree_builder_simulator/ambiguity_guard.rs
//! There are a few ambiguous cases where we can't determine the correct
//! parsing context because we have only limited information about the
//! current state of the tree builder. This caused issues in the past where
//! Cloudflare's security features were used as XSS gadgets
//! (see <https://portswigger.net/blog/when-security-features-collide>).
//! Therefore, due to these safety concerns, in such cases we prefer
//! to bail out of the tokenization process.
//!
//! In tree builder simulation we need to switch the parser to one
//! of the standalone text parsing state machines if we encounter some
//! specific tags. E.g. if we encounter a `<script>` start tag we should
//! treat all content up to the closing `</script>` tag as text.
//! Without a full-featured tree construction stage there is a way
//! to trick the parser into parsing content that has actual tags in it
//! as text, e.g. by putting a `<script>` start tag into a context where
//! it will be ignored.
//!
//! There are just a few tree builder insertion modes in which start tags
//! that switch the text parsing mode can be ignored: in `<select>` and in
//! or after `<frameset>`.
//!
//! There are numerous not-so-obvious ways to get into or out of these
//! insertion modes. So, for safety reasons, we try to be proactive here
//! and just bail out if we see a start tag that switches the text parsing
//! mode between the `<select>` start and end tags, or anywhere after the
//! `<frameset>` start tag. These cases shouldn't trigger a bailout for any
//! *conforming* markup.
//!
//! However, there is a case where a bailout could happen even with conforming
//! markup: if we encounter a start tag that switches the text parsing mode in
//! a `<template>` which is inside `<select>` element content. Unfortunately,
//! the rules required to track the template parsing context are way too
//! complicated in such a case and would require an implementation of a
//! significant part of the tree construction state. However, the current
//! assumption is that markup that can trigger this bailout case should be
//! seen quite rarely in the wild.
36use crate::html::{LocalNameHash, Tag};
37use std::fmt::{self, Display};
38use thiserror::Error;
39
40/// An error that occurs when HTML parser runs into an ambigious state in the [`strict`] mode.
41///
42/// Since the rewriter operates on a token stream and doesn't have access to a full
43/// DOM-tree, there are certain rare cases of non-conforming HTML markup which can't be
44/// guaranteed to be parsed correctly without an ability to backtrace the tree.
45///
46/// Therefore, due to security considerations, sometimes it's preferable to abort the
47/// rewriting process in case of such uncertainty.
48///
49/// One of the simplest examples of such markup is the following:
50///
51/// ```html
52/// ...
53/// <select><xmp><script>"use strict";</script></select>
54/// ...
55/// ```
56///
57/// The `<xmp>` element is not allowed inside the `<select>` element, so in a browser the start
58/// tag for `<xmp>` will be ignored and following `<script>` element will be parsed and executed.
59///
60/// On the other hand, the `<select>` element itself can be also ignored depending on the
61/// context in which it was parsed. In this case, the `<xmp>` element will not be ignored
62/// and the `<script>` element along with its content will be parsed as a simple text inside
63/// it.
64///
65/// So, in this case the parser needs an ability to backtrace the DOM-tree to figure out the
66/// correct parsing context.
67///
68/// [`strict`]: ../struct.Settings.html#structfield.strict
69#[derive(Error, Debug, Eq, PartialEq)]
70pub struct ParsingAmbiguityError {
71 on_tag_name: Box<str>,
72}
73
74impl Display for ParsingAmbiguityError {
75 #[cold]
76 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
77 write!(
78 f,
79 concat!(
80 "The parser has encountered a text content tag (`<{}>`) in the context where it is ",
81 "ambiguous whether this tag should be ignored or not. And, thus, is is unclear is ",
82 "consequent content should be parsed as raw text or HTML markup.",
83 "\n\n",
84 "This error occurs due to the limited capabilities of the streaming parsing. However, ",
85 "almost all of the cases of this error are caused by a non-conforming markup (e.g. a ",
86 "`<script>` element in `<select>` element)."
87 ),
88 self.on_tag_name
89 )
90 }
91}
92
93// NOTE: use macro for the assertion function definition, so we can
94// provide ambiguity error with a string representation of the tag
95// name without a necessity to implement conversion from u64 tag name
96// hash to a string. This also allows us to be consistent about asserted
97// tag name hashes and the corresponding tag name strings.
98macro_rules! create_assert_for_tags {
99 ( $($tag:ident),+ ) => {
100 #[cold]
101 fn tag_hash_to_string(tag_name: LocalNameHash) -> Box<str> {
102 let s = match tag_name {
103 $(t if t == Tag::$tag => stringify!($tag),)+
104 _ => "no string representation",
105 };
106 s.to_ascii_lowercase().into_boxed_str()
107 }
108
109 #[inline]
110 fn assert_not_ambigious_text_type_switch(
111 tag_name: LocalNameHash,
112 ) -> Result<(), ParsingAmbiguityError> {
113 if tag_is_one_of!(tag_name, [ $($tag),+ ]) {
114 Err(ParsingAmbiguityError {
115 on_tag_name: tag_hash_to_string(tag_name)
116 })
117 } else {
118 Ok(())
119 }
120 }
121 };
122}
123
124create_assert_for_tags!(
125 Textarea, Title, Plaintext, Script, Style, Iframe, Xmp, Noembed, Noframes, Noscript
126);
127
/// Parsing context tracked by `AmbiguityGuard`.
#[derive(Clone, Copy)]
enum State {
    /// Initial state; also re-entered when the "in select" context is left.
    Default,
    /// A `<select>` start tag was seen and the context has not been left yet.
    InSelect,
    /// Inside one or more `<template>` elements opened while in `InSelect`;
    /// the payload is the current `<template>` nesting depth (starting at 1).
    InTemplateInSelect(u64),
    /// A `<frameset>` start tag was seen; no tracked tag leaves this state.
    InOrAfterFrameset,
}
135
136pub(crate) struct AmbiguityGuard {
137 state: State,
138}
139
140impl Default for AmbiguityGuard {
141 fn default() -> Self {
142 Self {
143 state: State::Default,
144 }
145 }
146}
147
148impl AmbiguityGuard {
149 pub fn track_start_tag(
150 &mut self,
151 tag_name: LocalNameHash,
152 ) -> Result<(), ParsingAmbiguityError> {
153 match self.state {
154 State::Default => {
155 if tag_name == Tag::Select {
156 self.state = State::InSelect;
157 } else if tag_name == Tag::Frameset {
158 self.state = State::InOrAfterFrameset;
159 }
160 }
161 State::InSelect => {
162 // NOTE: these start tags cause premature exit
163 // from "in select" insertion mode.
164 if tag_is_one_of!(tag_name, [Select, Textarea, Input, Keygen]) {
165 self.state = State::Default;
166 } else if tag_name == Tag::Template {
167 self.state = State::InTemplateInSelect(1);
168 }
169 // NOTE: <script> is allowed in "in select" insertion mode.
170 else if tag_name != Tag::Script {
171 assert_not_ambigious_text_type_switch(tag_name)?;
172 }
173 }
174 State::InTemplateInSelect(depth) => {
175 if tag_name == Tag::Template {
176 self.state = State::InTemplateInSelect(depth + 1);
177 } else {
178 assert_not_ambigious_text_type_switch(tag_name)?;
179 }
180 }
181 State::InOrAfterFrameset => {
182 // NOTE: <noframes> is allowed in and after <frameset>.
183 if tag_name != Tag::Noframes {
184 assert_not_ambigious_text_type_switch(tag_name)?;
185 }
186 }
187 }
188
189 Ok(())
190 }
191
192 pub fn track_end_tag(&mut self, tag_name: LocalNameHash) {
193 match self.state {
194 State::InSelect if tag_name == Tag::Select => {
195 self.state = State::Default;
196 }
197 State::InTemplateInSelect(depth) if tag_name == Tag::Template => {
198 self.state = if depth == 1 {
199 State::InSelect
200 } else {
201 State::InTemplateInSelect(depth - 1)
202 }
203 }
204 _ => (),
205 }
206 }
207}