1use std::borrow::Cow;
2use std::collections::HashMap;
3use std::str::FromStr;
4
5use lazy_static::lazy_static;
6use fancy_regex::{Regex, Captures};
7
8use crate::html::HTML5;
9use crate::regextra::fregex;
10use crate::regex_snips::{BLOCK_CONTENT, DIVIDER_RE};
11
/// Escapes HTML-significant characters in `text`.
///
/// `&`, `<` and `>` are always replaced with `&amp;`, `&lt;` and `&gt;`.
///
/// * `quotes` - when `true`, `"` and `'` are additionally replaced with
///   `&quot;` and `&#39;`.
/// * `line_spacers` - when `true`, line feed, carriage return and tab are
///   additionally replaced with the numeric references `&#10;`, `&#13;`
///   and `&#9;`.
///
/// Every other character is copied through unchanged.
pub(crate) fn encode_html(text: &str, quotes: bool, line_spacers: bool) -> String {
    // Decides which characters have to be replaced for the requested mode.
    let needs_escape = |c: char| match c {
        '&' | '<' | '>' => true,
        '"' | '\'' => quotes,
        '\n' | '\r' | '\t' => line_spacers,
        _ => false,
    };

    let mut result = String::with_capacity(2 * text.len());
    let mut leftover = text;
    while let Some(sep_index) = leftover.find(needs_escape) {
        // Copy the untouched run in one go, then translate the single
        // special character that follows it.
        let (plain, rest) = leftover.split_at(sep_index);
        result.push_str(plain);
        // Every escapable character is ASCII, so it is exactly one byte long.
        let replacement = match rest.as_bytes()[0] {
            b'&' => "&amp;",
            b'<' => "&lt;",
            b'>' => "&gt;",
            b'"' => "&quot;",
            b'\'' => "&#39;",
            b'\n' => "&#10;",
            b'\r' => "&#13;",
            b'\t' => "&#9;",
            other => unreachable!("An impossible symbol to encode: {}", other as char),
        };
        result.push_str(replacement);
        leftover = &rest[1..];
    }
    result.push_str(leftover);
    result
}
46
47pub(crate) fn reverse_encode_html(text: &str) -> Cow<str> {
48 lazy_static! {
49 static ref ENTITY_RE: Regex = fregex!(
50 "(&(?:amp|lt|gt|quot|#39|#13|#10|#9);)");
51 }
52 ENTITY_RE.replace_all(text, |cap: &Captures| {
53 let entity = &cap[1];
54 match entity {
55 "<" => "<",
56 ">" => ">",
57 """ => "\"",
58 "'" => "'",
59 " " => "\n",
60 " " => "\r",
61 "	" => "\t",
62 _ => unreachable!("Entity {entity:#?} must be part of the regular expression")
63 }
64 })
65}
66
67pub(crate) fn quoteattr(data: &str) -> String {
70 let data = encode_html(data, false, true);
71 if data.contains('"') {
72 if data.contains('\'') {
73 format!("\"{}\"", data.replace('"', """))
74 } else {
75 format!("'{}'", data)
76 }
77 } else {
78 format!("\"{}\"", data)
79 }
80}
81
/// Tells whether `c` is accepted as part of an HTML attribute name.
///
/// Rejected are control characters, whitespace, the code points
/// U+FDD0..=U+FDEF and the delimiters `=`, `/`, `>`, `"` and `'`.
fn is_valid_attribute_char(c: char) -> bool {
    if c.is_control() || c.is_whitespace() {
        return false;
    }
    !matches!(c, '\u{FDD0}'..='\u{FDEF}' | '=' | '/' | '>' | '"' | '\'')
}
93
94
95pub(crate) fn join_html_attributes(result: &mut String, attributes: &[(String, String)]) {
96 let valid_attrs = attributes.iter().filter(|(name, _)| name.chars().all(is_valid_attribute_char));
97 for (aname, avalue) in valid_attrs {
98 result.push(' ');
99 result.push_str(aname);
100 result.push('=');
101 result.push_str("eattr(avalue));
102 }
103}
104
/// Uniform view of "maybe a string" values as `Option<&str>`.
///
/// Lets a function accept `&str`, `&String` and `&Option<String>` through a
/// single generic parameter.
pub(crate) trait AsOptionStr {
    /// Returns the borrowed text, or `None` when there is no value.
    fn as_option_str(&self) -> Option<&str>;
}

/// An optional owned string: `None` stays `None`.
impl AsOptionStr for &Option<String> {
    fn as_option_str(&self) -> Option<&str> {
        Option::as_deref(*self)
    }
}

/// A string slice is always present.
impl AsOptionStr for &str {
    fn as_option_str(&self) -> Option<&str> {
        let text: &str = self;
        Some(text)
    }
}

/// A borrowed owned string is always present.
impl AsOptionStr for &String {
    fn as_option_str(&self) -> Option<&str> {
        Some(String::as_str(self))
    }
}
126
127pub(crate) fn generate_tag<S>(
132 tag: S, content: Option<&str>, attributes: &[(String, String)]
133) -> String
134 where S: AsOptionStr
135{
136 if let Some(tag) = tag.as_option_str() {
137 if tag.is_empty() {
138 return content.unwrap_or_default().to_owned();
139 }
140 if !tag.chars().all(char::is_alphanumeric) {
141 return encode_html(content.unwrap_or_default(), true, false);
142 }
143
144 let mut result = String::from("<") + tag;
145 join_html_attributes(&mut result, attributes);
146 match content {
147 Some(text) => {
148 result.push('>');
149 result.push_str(text);
150 result.push_str("</");
151 result.push_str(tag);
152 result.push('>');
153 },
154 None => {
155 result.push_str(" />");
156 },
157 }
158 result
159 } else {
160 content.unwrap_or_default().to_owned()
161 }
162}
163
164lazy_static! {
165 static ref INVALID_CHARREFS: HashMap<u32, char> = HashMap::from([
166 (0x00, '\u{fffd}'), (0x0d, '\r'), (0x80, '\u{20ac}'), (0x81, '\u{81}'), (0x82, '\u{201a}'), (0x83, '\u{0192}'), (0x84, '\u{201e}'), (0x85, '\u{2026}'), (0x86, '\u{2020}'), (0x87, '\u{2021}'), (0x88, '\u{02c6}'), (0x89, '\u{2030}'), (0x8a, '\u{0160}'), (0x8b, '\u{2039}'), (0x8c, '\u{0152}'), (0x8d, '\u{8d}'), (0x8e, '\u{017d}'), (0x8f, '\u{8f}'), (0x90, '\u{90}'), (0x91, '\u{2018}'), (0x92, '\u{2019}'), (0x93, '\u{201c}'), (0x94, '\u{201d}'), (0x95, '\u{2022}'), (0x96, '\u{2013}'), (0x97, '\u{2014}'), (0x98, '\u{02dc}'), (0x99, '\u{2122}'), (0x9a, '\u{0161}'), (0x9b, '\u{203a}'), (0x9c, '\u{0153}'), (0x9d, '\u{9d}'), (0x9e, '\u{017e}'), (0x9f, '\u{0178}'), ]);
201
202}
203
/// Tells whether `cp` belongs to the set of code points that
/// `replace_charref` drops (replaces with an empty string).
///
/// The set consists of:
/// * the control ranges 0x01..=0x08, 0x0E..=0x1F, 0x7F..=0x9F and 0x0B;
/// * the range 0xFDD0..=0xFDEF;
/// * the last two code points (`..FFFE`, `..FFFF`) of each of the 17 planes
///   up to 0x10FFFF.
fn is_invalid_codepoint(cp: u32) -> bool {
    let disallowed_control = matches!(cp, 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F);
    let in_fdd0_range = (0xFDD0..=0xFDEF).contains(&cp);
    // The low 16 bits are FFFE or FFFF exactly when masking out the lowest
    // bit leaves FFFE; values past the last plane are not part of the set.
    let plane_final = cp <= 0x10FFFF && (cp & 0xFFFE) == 0xFFFE;

    disallowed_control || in_fdd0_range || plane_final
}
214
215
216fn replace_charref(s: &Captures) -> String {
217 let s = &s[1];
218 if let Some(stripped) = s.strip_prefix('#') {
219 let num = match s.chars().nth(1) {
221 Some('x') | Some('X') => u32::from_str_radix(s[2..].trim_end_matches(';'), 16),
222 _ => u32::from_str(stripped.trim_end_matches(';'))
223 }.expect("Must be convertible to int");
224
225 if let Some(v) = INVALID_CHARREFS.get(&num) {
226 v.to_string()
227 } else if (0xD800..=0xDFFF).contains(&num) || num > 0x10FFFF {
228 "\u{FFFD}".to_string()
229 } else if is_invalid_codepoint(num) {
230 "".to_string()
231 } else {
232 char::from_u32(num).expect("A valid char").to_string()
233 }
234 } else {
235 if let Some(v) = HTML5.get(s) {
237 v.to_string()
238 } else {
239 if s.len() > 1 {
241 let mut x = s.len() - 1;
242 while x > 1 {
243 if let Some(m) = HTML5.get(&s[..x]) {
244 return m.to_string() + &s[x..];
245 }
246 x -= 1;
247 }
248 }
249 "&".to_string() + s
250 }
251 }
252}
253
254pub(crate) fn unescape(s: &str) -> Cow<str> {
258 if !s.contains('&') {
259 Cow::Borrowed(s)
260 } else {
261 lazy_static! {
262 static ref CHARREF: Regex = fregex!(
263 concat!(r"&(#[0-9]+;?",
264 r"|#[xX][0-9a-fA-F]+;?",
265 r"|[^\t\n\f <&#;]{1,32};?)"));
266 }
267 CHARREF.replace_all(s, replace_charref)
268 }
269}
270
271
272pub(crate) fn has_raw_text(text: &str) -> bool {
273 const PHRASING_CONTENT: &str = concat!(
274 "abbr|acronym|area|audio|a|bdo|br|button|b|canvas|cite|code|command|",
275 "data|datalist|del|dfn|em|embed|iframe|img|input|ins|i|kbd|keygen|",
276 "label|link|map|mark|math|meta|meter|noscript|object|output|progress|",
277 "q|ruby|samp|script|select|small|span|strong|sub|sup|svg|textarea|",
278 "time|var|video|wbr",
279 );
280 lazy_static! {
281 static ref UNWRAPPABLE_RE: Regex = fregex!(
282 &format!(r"(?si)</?(?:{0})(?:\s[^<>]*?|/?)>", BLOCK_CONTENT));
283 static ref WRAPPED_RE: Regex = fregex!(
284 r"(?si)^</?([^\s<>/]+)[^<>]*?>(?:.*</\1\s*?>)?$");
285 static ref PHRASING_RE: Regex = fregex!(
286 &format!(r"(?i)^(?:{0})$", PHRASING_CONTENT));
287 }
288
289
290 if UNWRAPPABLE_RE.is_match(text).unwrap_or_default()
291 || DIVIDER_RE.is_match(text).unwrap_or_default() {
292 false
293 } else if let Some(m) = WRAPPED_RE.captures(text).unwrap_or_default() {
294 PHRASING_RE.is_match(&m[1]).unwrap_or_default()
295 } else {
296 true
297 }
298}
299
300
#[cfg(test)]
mod tests {
    use super::{quoteattr, unescape, encode_html, has_raw_text};

    /// Markup characters and the line feed are escaped; a value containing
    /// only double quotes is wrapped in single quotes, and a value with both
    /// kinds of quotes gets its double quotes replaced.
    #[test]
    fn test_quoteattr() {
        assert_eq!(
            quoteattr("So called \"escaped\"\nmulti-line <value>"),
            "'So called \"escaped\"&#10;multi-line &lt;value&gt;'");
        assert_eq!(
            quoteattr("both \"double\" and 'single'"),
            "\"both &quot;double&quot; and 'single'\"");
    }

    /// Escaping followed by unescaping gives back the original markup,
    /// including an entity that was already present in it.
    #[test]
    fn test_unescape() {
        let original = r#"<a href="http://example.com">Some&nbsp;link</a>"#;
        let escaped = encode_html(original, true, false);
        assert_eq!(
            escaped,
            "&lt;a href=&quot;http://example.com&quot;&gt;Some&amp;nbsp;link&lt;/a&gt;");
        let unescaped = unescape(&escaped);
        assert_eq!(unescaped, original);
    }

    /// A paragraph element is not raw text, plain words are.
    #[test]
    fn test_has_raw_text() {
        assert!(!has_raw_text("<p>foo bar biz baz</p>"));
        assert!(has_raw_text(" why yes, yes it does"));
    }
}