rustextile/
htmltools.rs

1use std::borrow::Cow;
2use std::collections::HashMap;
3use std::str::FromStr;
4
5use lazy_static::lazy_static;
6use fancy_regex::{Regex, Captures};
7
8use crate::html::HTML5;
9use crate::regextra::fregex;
10use crate::regex_snips::{BLOCK_CONTENT, DIVIDER_RE};
11
/// Replaces HTML-significant characters in `text` with character references.
///
/// `&`, `<` and `>` are always encoded. When `quotes` is set, `"` and `'`
/// are encoded as well; when `line_spacers` is set, line feed, carriage
/// return and tab are turned into numeric references. Every other character
/// is copied through unchanged.
pub(crate) fn encode_html(text: &str, quotes: bool, line_spacers: bool) -> String {
    // Most replacements are longer than the character they replace, so
    // reserve generously up front.
    let mut encoded = String::with_capacity(2 * text.len());
    for ch in text.chars() {
        let entity = match ch {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            '"' if quotes => "&quot;",
            '\'' if quotes => "&#39;",
            // NOTE(review): '\n' is U+000A (decimal 10) and '\r' is U+000D
            // (decimal 13), so these two references look swapped. The mapping
            // is kept as is because `reverse_encode_html` decodes with the
            // same pairing — confirm before changing either side.
            '\n' if line_spacers => "&#13;",
            '\r' if line_spacers => "&#10;",
            '\t' if line_spacers => "&#9;",
            _ => {
                encoded.push(ch);
                continue;
            }
        };
        encoded.push_str(entity);
    }
    encoded
}
46
47pub(crate) fn reverse_encode_html(text: &str) -> Cow<str> {
48    lazy_static! {
49        static ref ENTITY_RE: Regex = fregex!(
50            "(&(?:amp|lt|gt|quot|#39|#13|#10|#9);)");
51    }
52    ENTITY_RE.replace_all(text, |cap: &Captures| {
53        let entity = &cap[1];
54        match entity {
55            "&lt;" => "<",
56            "&gt;" => ">",
57            "&quot;" => "\"",
58            "&#39;" => "'",
59            "&#13;" => "\n",
60            "&#10;" => "\r",
61            "&#9;" => "\t",
62            _ => unreachable!("Entity {entity:#?} must be part of the regular expression")
63        }
64    })
65}
66
67/// Escapes and quotes an XML/HTML attribute value.
68/// Functional analog of xml.sax.saxutils.quoteattr from Python3
69pub(crate) fn quoteattr(data: &str) -> String {
70    let data = encode_html(data, false, true);
71    if data.contains('"') {
72        if data.contains('\'') {
73            format!("\"{}\"", data.replace('"', "&quot;"))
74        } else {
75            format!("'{}'", data)
76        }
77    } else {
78        format!("\"{}\"", data)
79    }
80}
81
// Based on [the latest HTML standard](https://html.spec.whatwg.org/multipage/syntax.html#attributes-2)
//
// Returns `true` when `c` may appear in an HTML attribute name, i.e. when it
// is none of: a control character, whitespace, `=`, `/`, `>`, `"`, `'`, or a
// Unicode noncharacter.
fn is_valid_attribute_char(c: char) -> bool {
    // Noncharacters are U+FDD0..=U+FDEF plus the last two code points of
    // every plane (U+FFFE, U+FFFF, U+1FFFE, ..., U+10FFFF). The latter all
    // have the bits 1..=15 set, which the mask below tests in one step.
    let is_noncharacter = ('\u{FDD0}'..='\u{FDEF}').contains(&c)
        || (c as u32) & 0xFFFE == 0xFFFE;

    !(c.is_control()
      || c.is_whitespace()
      || is_noncharacter
      || matches!(c, '=' | '/' | '>' | '"' | '\''))
}
93
94
95pub(crate) fn join_html_attributes(result: &mut String, attributes: &[(String, String)]) {
96    let valid_attrs = attributes.iter().filter(|(name, _)| name.chars().all(is_valid_attribute_char));
97    for (aname, avalue) in valid_attrs {
98        result.push(' ');
99        result.push_str(aname);
100        result.push('=');
101        result.push_str(&quoteattr(avalue));
102    }
103}
104
/// Uniform view of "maybe a string" arguments as `Option<&str>`.
///
/// Lets a generic function accept `&str`, `&String` or `&Option<String>`
/// for the same parameter.
pub(crate) trait AsOptionStr {
    /// Borrows the value as an optional string slice.
    fn as_option_str(&self) -> Option<&str>;
}

/// An optional owned string: `None` stays `None`, `Some` is borrowed.
impl AsOptionStr for &Option<String> {
    fn as_option_str(&self) -> Option<&str> {
        Option::as_deref(*self)
    }
}

/// A string slice is always present.
impl AsOptionStr for &str {
    fn as_option_str(&self) -> Option<&str> {
        Some(&**self)
    }
}

/// A borrowed `String` is always present.
impl AsOptionStr for &String {
    fn as_option_str(&self) -> Option<&str> {
        Some(&self[..])
    }
}
126
127// Generates a complete HTML tag with a given name, attributes and content.
128// Any of the attributes containing "illegal" characters won't be added.
129// If the tag`s name contains invalid characters, whole content will be "safed"
130// (by `encoded_html`) and returned instead.
131pub(crate) fn generate_tag<S>(
132    tag: S, content: Option<&str>, attributes: &[(String, String)]
133) -> String
134    where S: AsOptionStr
135{
136    if let Some(tag) = tag.as_option_str() {
137        if tag.is_empty() {
138            return content.unwrap_or_default().to_owned();
139        }
140        if !tag.chars().all(char::is_alphanumeric) {
141            return encode_html(content.unwrap_or_default(), true, false);
142        }
143
144        let mut result = String::from("<") + tag;
145        join_html_attributes(&mut result, attributes);
146        match content {
147            Some(text) => {
148                result.push('>');
149                result.push_str(text);
150                result.push_str("</");
151                result.push_str(tag);
152                result.push('>');
153            },
154            None => {
155                result.push_str(" />");
156            },
157        }
158        result
159    } else {
160        content.unwrap_or_default().to_owned()
161    }
162}
163
// Replacement table for numeric character references whose code point must
// not be emitted literally. `replace_charref` consults it first: when the
// parsed number is a key here, the mapped character is used instead.
// Entries that map a code point to itself (0x0d, 0x81, 0x8d, 0x8f, 0x90,
// 0x9d) are kept so those values are resolved here rather than being
// dropped by the later `is_invalid_codepoint` check.
// NOTE(review): presumably mirrors the `_invalid_charrefs` table of Python's
// `html` module, since `unescape` is documented as its equivalent — confirm.
lazy_static! {
    static ref INVALID_CHARREFS: HashMap<u32, char> = HashMap::from([
        (0x00, '\u{fffd}'),  // REPLACEMENT CHARACTER
        (0x0d, '\r'),      // CARRIAGE RETURN
        (0x80, '\u{20ac}'),  // EURO SIGN
        (0x81, '\u{81}'),    // <control>
        (0x82, '\u{201a}'),  // SINGLE LOW-9 QUOTATION MARK
        (0x83, '\u{0192}'),  // LATIN SMALL LETTER F WITH HOOK
        (0x84, '\u{201e}'),  // DOUBLE LOW-9 QUOTATION MARK
        (0x85, '\u{2026}'),  // HORIZONTAL ELLIPSIS
        (0x86, '\u{2020}'),  // DAGGER
        (0x87, '\u{2021}'),  // DOUBLE DAGGER
        (0x88, '\u{02c6}'),  // MODIFIER LETTER CIRCUMFLEX ACCENT
        (0x89, '\u{2030}'),  // PER MILLE SIGN
        (0x8a, '\u{0160}'),  // LATIN CAPITAL LETTER S WITH CARON
        (0x8b, '\u{2039}'),  // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        (0x8c, '\u{0152}'),  // LATIN CAPITAL LIGATURE OE
        (0x8d, '\u{8d}'),    // <control>
        (0x8e, '\u{017d}'),  // LATIN CAPITAL LETTER Z WITH CARON
        (0x8f, '\u{8f}'),    // <control>
        (0x90, '\u{90}'),    // <control>
        (0x91, '\u{2018}'),  // LEFT SINGLE QUOTATION MARK
        (0x92, '\u{2019}'),  // RIGHT SINGLE QUOTATION MARK
        (0x93, '\u{201c}'),  // LEFT DOUBLE QUOTATION MARK
        (0x94, '\u{201d}'),  // RIGHT DOUBLE QUOTATION MARK
        (0x95, '\u{2022}'),  // BULLET
        (0x96, '\u{2013}'),  // EN DASH
        (0x97, '\u{2014}'),  // EM DASH
        (0x98, '\u{02dc}'),  // SMALL TILDE
        (0x99, '\u{2122}'),  // TRADE MARK SIGN
        (0x9a, '\u{0161}'),  // LATIN SMALL LETTER S WITH CARON
        (0x9b, '\u{203a}'),  // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        (0x9c, '\u{0153}'),  // LATIN SMALL LIGATURE OE
        (0x9d, '\u{9d}'),    // <control>
        (0x9e, '\u{017e}'),  // LATIN SMALL LETTER Z WITH CARON
        (0x9f, '\u{0178}'),  // LATIN CAPITAL LETTER Y WITH DIAERESIS
    ]);

}
203
/// Tells whether `cp` is a code point that a numeric character reference
/// must not resolve to.
///
/// Three groups are rejected:
/// * the control ranges 0x01..=0x08, 0x0B, 0x0E..=0x1F and 0x7F..=0x9F;
/// * the range 0xFDD0..=0xFDEF;
/// * the last two code points of each plane from 0 to 16
///   (0xFFFE, 0xFFFF, 0x1FFFE, ..., 0x10FFFF).
fn is_invalid_codepoint(cp: u32) -> bool {
    let is_control = matches!(
        cp,
        0x0001..=0x0008 | 0x000B | 0x000E..=0x001F | 0x007F..=0x009F
    );
    let in_fdd0_block = (0xFDD0..=0xFDEF).contains(&cp);
    // Low 16 bits equal to 0xFFFE or 0xFFFF, restricted to the Unicode range.
    let is_plane_tail = cp <= 0x10FFFF && (cp & 0xFFFF) >= 0xFFFE;

    is_control || in_fdd0_block || is_plane_tail
}
214
215
216fn replace_charref(s: &Captures) -> String {
217    let s = &s[1];
218    if let Some(stripped) = s.strip_prefix('#') {
219        // numeric charref
220        let num = match s.chars().nth(1) {
221            Some('x') | Some('X') => u32::from_str_radix(s[2..].trim_end_matches(';'), 16),
222            _ => u32::from_str(stripped.trim_end_matches(';'))
223        }.expect("Must be convertible to int");
224
225        if let Some(v) = INVALID_CHARREFS.get(&num) {
226            v.to_string()
227        } else if (0xD800..=0xDFFF).contains(&num) || num > 0x10FFFF {
228            "\u{FFFD}".to_string()
229        } else if is_invalid_codepoint(num) {
230            "".to_string()
231        } else {
232            char::from_u32(num).expect("A valid char").to_string()
233        }
234    } else {
235        // named charref
236        if let Some(v) = HTML5.get(s) {
237            v.to_string()
238        } else {
239            // find the longest matching name (as defined by the standard)
240            if s.len() > 1 {
241                let mut x = s.len() - 1;
242                while x > 1 {
243                    if let Some(m) = HTML5.get(&s[..x]) {
244                        return m.to_string() + &s[x..];
245                    }
246                    x -= 1;
247                }
248            }
249            "&".to_string() + s
250        }
251    }
252}
253
254/// A full equivalent of `html.unescape` from Python. Transforms a string
255/// by replacing "escaped" HTML characters (such as `&gt;`) into their original
256/// form (character `>` in this instance).
257pub(crate) fn unescape(s: &str) -> Cow<str> {
258    if !s.contains('&') {
259        Cow::Borrowed(s)
260    } else {
261        lazy_static! {
262            static ref CHARREF: Regex = fregex!(
263                concat!(r"&(#[0-9]+;?",
264                        r"|#[xX][0-9a-fA-F]+;?",
265                        r"|[^\t\n\f <&#;]{1,32};?)"));
266        }
267        CHARREF.replace_all(s, replace_charref)
268    }
269}
270
271
272pub(crate) fn has_raw_text(text: &str) -> bool {
273    const PHRASING_CONTENT: &str = concat!(
274        "abbr|acronym|area|audio|a|bdo|br|button|b|canvas|cite|code|command|",
275        "data|datalist|del|dfn|em|embed|iframe|img|input|ins|i|kbd|keygen|",
276        "label|link|map|mark|math|meta|meter|noscript|object|output|progress|",
277        "q|ruby|samp|script|select|small|span|strong|sub|sup|svg|textarea|",
278        "time|var|video|wbr",
279    );
280    lazy_static! {
281        static ref UNWRAPPABLE_RE: Regex = fregex!(
282            &format!(r"(?si)</?(?:{0})(?:\s[^<>]*?|/?)>", BLOCK_CONTENT));
283        static ref WRAPPED_RE: Regex = fregex!(
284            r"(?si)^</?([^\s<>/]+)[^<>]*?>(?:.*</\1\s*?>)?$");
285        static ref PHRASING_RE: Regex = fregex!(
286            &format!(r"(?i)^(?:{0})$", PHRASING_CONTENT));
287    }
288
289
290    if UNWRAPPABLE_RE.is_match(text).unwrap_or_default()
291            || DIVIDER_RE.is_match(text).unwrap_or_default() {
292        false
293    } else if let Some(m) = WRAPPED_RE.captures(text).unwrap_or_default() {
294        PHRASING_RE.is_match(&m[1]).unwrap_or_default()
295    } else {
296        true
297    }
298}
299
300
#[cfg(test)]
mod tests {
    use super::{quoteattr, unescape, encode_html, has_raw_text};

    /// A value with double quotes only is wrapped in single quotes, with
    /// markup characters and the line break encoded.
    #[test]
    fn test_quoteattr() {
        let quoted = quoteattr("So called \"escaped\"\nmulti-line <value>");
        let expected = "'So called \"escaped\"&#13;multi-line &lt;value&gt;'";
        assert_eq!(quoted, expected);
    }

    /// Escaping with `encode_html` and then calling `unescape` gives back
    /// the original markup.
    #[test]
    fn test_unescape() {
        let original = r#"<a href="http://example.com">Some&nbsp;link</a>"#;

        let escaped = encode_html(original, true, false);
        assert_eq!(escaped, "&lt;a href=&quot;http://example.com&quot;&gt;Some&amp;nbsp;link&lt;/a&gt;");

        let round_tripped = unescape(&escaped);
        assert_eq!(round_tripped, original);
    }

    /// Text wrapped in a `<p>` tag is not raw; plain text is.
    #[test]
    fn test_has_raw_text() {
        let wrapped = "<p>foo bar biz baz</p>";
        let plain = " why yes, yes it does";
        assert!(!has_raw_text(wrapped));
        assert!(has_raw_text(plain));
    }
}