use std::borrow::Cow;
use std::collections::HashMap;
use std::str::FromStr;
use lazy_static::lazy_static;
use fancy_regex::{Regex, Captures};
use crate::html::HTML5;
use crate::regextra::fregex;
use crate::regex_snips::{BLOCK_CONTENT, DIVIDER_RE};
/// Escapes characters that are special in HTML by replacing them with
/// character references.
///
/// * `&`, `<` and `>` are always replaced with `&amp;`, `&lt;` and `&gt;`.
/// * When `quotes` is true, `"` and `'` are additionally replaced with
///   `&quot;` and `&#39;`.
/// * When `line_spacers` is true, line feed, carriage return and tab are
///   additionally replaced with `&#10;`, `&#13;` and `&#9;`.
///
/// Every other character is copied to the output unchanged.
pub(crate) fn encode_html(text: &str, quotes: bool, line_spacers: bool) -> String {
    // Escaping only ever grows the text, so reserve some headroom up front.
    let mut result = String::with_capacity(2 * text.len());
    for ch in text.chars() {
        let replacement = match ch {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            '"' if quotes => "&quot;",
            '\'' if quotes => "&#39;",
            '\n' if line_spacers => "&#10;",
            '\r' if line_spacers => "&#13;",
            '\t' if line_spacers => "&#9;",
            other => {
                result.push(other);
                continue;
            }
        };
        result.push_str(replacement);
    }
    result
}
/// Reverses the escaping performed by [`encode_html`].
///
/// Only the eight character references that `encode_html` can produce are
/// decoded (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&#10;`, `&#13;` and
/// `&#9;`); any other text is left untouched.
pub(crate) fn reverse_encode_html(text: &str) -> Cow<str> {
    lazy_static! {
        static ref ENTITY_RE: Regex = fregex!(
            "(&(?:amp|lt|gt|quot|#39|#13|#10|#9);)");
    }
    ENTITY_RE.replace_all(text, |cap: &Captures| {
        // Group 1 is the complete reference, including the leading `&` and
        // the trailing `;`, so the arms below must spell it out in full.
        let entity = &cap[1];
        match entity {
            "&amp;" => "&",
            "&lt;" => "<",
            "&gt;" => ">",
            "&quot;" => "\"",
            "&#39;" => "'",
            "&#10;" => "\n",
            "&#13;" => "\r",
            "&#9;" => "\t",
            _ => unreachable!("Entity {entity:#?} must be part of the regular expression")
        }
    })
}
pub(crate) fn quoteattr(data: &str) -> String {
let data = encode_html(data, false, true);
if data.contains('"') {
if data.contains('\'') {
format!("\"{}\"", data.replace('"', """))
} else {
format!("'{}'", data)
}
} else {
format!("\"{}\"", data)
}
}
/// Tells whether `c` is accepted as part of an attribute name.
///
/// Control characters, whitespace, the code points U+FDD0..=U+FDEF and the
/// characters `=`, `/`, `>`, `"` and `'` are rejected; everything else is
/// accepted.
fn is_valid_attribute_char(c: char) -> bool {
    if c.is_control() || c.is_whitespace() {
        return false;
    }
    !matches!(c, '\u{FDD0}'..='\u{FDEF}' | '=' | '/' | '>' | '"' | '\'')
}
pub(crate) fn join_html_attributes(result: &mut String, attributes: &[(String, String)]) {
let valid_attrs = attributes.iter().filter(|(name, _)| name.chars().all(is_valid_attribute_char));
for (aname, avalue) in valid_attrs {
result.push(' ');
result.push_str(aname);
result.push('=');
result.push_str("eattr(avalue));
}
}
/// Uniform view of "maybe a string" values as `Option<&str>`.
///
/// Implemented for `&Option<String>`, `&str` and `&String`, so a generic
/// function can accept any of them for an optional textual argument.
pub(crate) trait AsOptionStr {
    /// Returns the contained text, or `None` when there is none.
    fn as_option_str(&self) -> Option<&str>;
}

impl AsOptionStr for &Option<String> {
    /// Borrows the inner `String`, if any, as a string slice.
    fn as_option_str(&self) -> Option<&str> {
        self.as_ref().map(String::as_str)
    }
}

impl AsOptionStr for &str {
    /// A plain string slice is always present.
    fn as_option_str(&self) -> Option<&str> {
        let text: &str = self;
        Some(text)
    }
}

impl AsOptionStr for &String {
    /// An owned string reference is always present.
    fn as_option_str(&self) -> Option<&str> {
        let text: &str = self;
        Some(text)
    }
}
/// Renders an HTML element named `tag` around `content`.
///
/// * No tag, or an empty tag name: `content` is returned as is (an empty
///   string when `content` is `None`).
/// * A tag name containing a non-alphanumeric character: no element is
///   produced and `content` is returned HTML-escaped, quotes included.
/// * Otherwise: `<tag attrs>content</tag>` when `content` is `Some`, or the
///   self-closing `<tag attrs />` when it is `None`. `content` is inserted
///   without escaping.
pub(crate) fn generate_tag<S>(
    tag: S, content: Option<&str>, attributes: &[(String, String)]
) -> String
    where S: AsOptionStr
{
    let body = content.unwrap_or_default();
    let name = match tag.as_option_str() {
        Some(name) if !name.is_empty() => name,
        _ => return body.to_owned(),
    };
    if name.chars().any(|c| !c.is_alphanumeric()) {
        return encode_html(body, true, false);
    }

    let mut html = String::from("<");
    html.push_str(name);
    join_html_attributes(&mut html, attributes);
    if let Some(text) = content {
        html.push('>');
        html.push_str(text);
        html.push_str("</");
        html.push_str(name);
        html.push('>');
    } else {
        html.push_str(" />");
    }
    html
}
lazy_static! {
    /// Replacement characters for numeric character references whose code
    /// point must not be emitted literally. `replace_charref` consults this
    /// table before any other check.
    // NOTE(review): the 0x80..=0x9f rows look like the Windows-1252
    // remapping used by HTML parsers — confirm against the HTML spec.
    static ref INVALID_CHARREFS: HashMap<u32, char> = HashMap::from([
        (0x00, '\u{fffd}'),
        (0x0d, '\r'),
        (0x80, '\u{20ac}'),
        (0x81, '\u{81}'),
        (0x82, '\u{201a}'),
        (0x83, '\u{0192}'),
        (0x84, '\u{201e}'),
        (0x85, '\u{2026}'),
        (0x86, '\u{2020}'),
        (0x87, '\u{2021}'),
        (0x88, '\u{02c6}'),
        (0x89, '\u{2030}'),
        (0x8a, '\u{0160}'),
        (0x8b, '\u{2039}'),
        (0x8c, '\u{0152}'),
        (0x8d, '\u{8d}'),
        (0x8e, '\u{017d}'),
        (0x8f, '\u{8f}'),
        (0x90, '\u{90}'),
        (0x91, '\u{2018}'),
        (0x92, '\u{2019}'),
        (0x93, '\u{201c}'),
        (0x94, '\u{201d}'),
        (0x95, '\u{2022}'),
        (0x96, '\u{2013}'),
        (0x97, '\u{2014}'),
        (0x98, '\u{02dc}'),
        (0x99, '\u{2122}'),
        (0x9a, '\u{0161}'),
        (0x9b, '\u{203a}'),
        (0x9c, '\u{0153}'),
        (0x9d, '\u{9d}'),
        (0x9e, '\u{017e}'),
        (0x9f, '\u{0178}'),
    ]);
}
/// Tells whether `cp` belongs to the set of code points that
/// `replace_charref` drops from its output.
///
/// The set consists of 0x01..=0x08, 0x0B, 0x0E..=0x1F, 0x7F..=0x9F,
/// 0xFDD0..=0xFDEF and the last two code points (`..FFFE` and `..FFFF`) of
/// every 0x10000-sized block up to 0x10FFFF.
fn is_invalid_codepoint(cp: u32) -> bool {
    // 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, ..., 0x10FFFE, 0x10FFFF.
    let ends_a_block = cp <= 0x10ffff && (cp & 0xffff) >= 0xfffe;

    ends_a_block
        || cp == 0xb
        || (0x0001..=0x0008).contains(&cp)
        || (0x000E..=0x001F).contains(&cp)
        || (0x007F..=0x009F).contains(&cp)
        || (0xFDD0..=0xFDEF).contains(&cp)
}
/// Produces the replacement text for one character reference matched by the
/// `CHARREF` regex in `unescape`.
///
/// Capture group 1 holds the reference without its leading `&`.
///
/// Numeric references (`#123`, `#x1F`, with an optional trailing `;`):
/// * code points listed in `INVALID_CHARREFS` yield the mapped character;
/// * surrogates (0xD800..=0xDFFF) and values above 0x10FFFF yield U+FFFD;
/// * code points flagged by `is_invalid_codepoint` yield an empty string;
/// * anything else yields the character itself.
///
/// Named references are looked up in `HTML5`. When the full name is unknown,
/// progressively shorter prefixes (down to two bytes) are tried and the
/// unmatched tail is appended verbatim. If nothing matches, the original
/// text (`&` plus the name) is returned unchanged.
fn replace_charref(s: &Captures) -> String {
    let s = &s[1];
    if let Some(stripped) = s.strip_prefix('#') {
        let digits = stripped.trim_end_matches(';');
        let parsed = match digits.strip_prefix(&['x', 'X'][..]) {
            Some(hex) => u32::from_str_radix(hex, 16),
            None => u32::from_str(digits),
        };
        // The regex guarantees at least one valid digit, so parsing can only
        // fail when the number does not fit into `u32`. Such a value is far
        // beyond 0x10FFFF and is handled like any other out-of-range code
        // point instead of panicking on user-supplied text.
        let num = parsed.unwrap_or(u32::MAX);
        if let Some(v) = INVALID_CHARREFS.get(&num) {
            v.to_string()
        } else if (0xD800..=0xDFFF).contains(&num) || num > 0x10FFFF {
            "\u{FFFD}".to_string()
        } else if is_invalid_codepoint(num) {
            String::new()
        } else {
            char::from_u32(num).expect("A valid char").to_string()
        }
    } else if let Some(v) = HTML5.get(s) {
        v.to_string()
    } else {
        // Try the longest known prefix first: x = len - 1, len - 2, ..., 2.
        for x in (2..s.len()).rev() {
            // The name may contain multi-byte characters; slicing in the
            // middle of one would panic, and such a prefix cannot be a
            // valid key anyway.
            if !s.is_char_boundary(x) {
                continue;
            }
            if let Some(m) = HTML5.get(&s[..x]) {
                return m.to_string() + &s[x..];
            }
        }
        "&".to_string() + s
    }
}
/// Replaces HTML character references in `s` with the text they stand for.
///
/// Numeric (`&#NN;`, `&#xHH;`) and named references are matched by `CHARREF`
/// and converted by `replace_charref`. Input without any `&` is returned
/// borrowed, without running the regex.
pub(crate) fn unescape(s: &str) -> Cow<str> {
    lazy_static! {
        static ref CHARREF: Regex = fregex!(
            concat!(r"&(#[0-9]+;?",
                    r"|#[xX][0-9a-fA-F]+;?",
                    r"|[^\t\n\f <&#;]{1,32};?)"));
    }
    if s.contains('&') {
        CHARREF.replace_all(s, replace_charref)
    } else {
        Cow::Borrowed(s)
    }
}
/// Decides whether `text` should be treated as raw text.
///
/// * Returns `false` when `text` contains an opening or closing tag named in
///   `BLOCK_CONTENT`, or when `DIVIDER_RE` matches it.
/// * Otherwise, when the whole of `text` is a single tag (optionally with
///   content and its matching closing tag), returns whether that tag's name
///   is one of the `PHRASING_CONTENT` names, compared case-insensitively.
/// * Returns `true` for everything else.
///
/// A regex engine error is treated the same as "no match".
pub(crate) fn has_raw_text(text: &str) -> bool {
    const PHRASING_CONTENT: &str = concat!(
        "abbr|acronym|area|audio|a|bdo|br|button|b|canvas|cite|code|command|",
        "data|datalist|del|dfn|em|embed|iframe|img|input|ins|i|kbd|keygen|",
        "label|link|map|mark|math|meta|meter|noscript|object|output|progress|",
        "q|ruby|samp|script|select|small|span|strong|sub|sup|svg|textarea|",
        "time|var|video|wbr",
    );
    lazy_static! {
        static ref UNWRAPPABLE_RE: Regex = fregex!(
            &format!(r"(?si)</?(?:{0})(?:\s[^<>]*?|/?)>", BLOCK_CONTENT));
        static ref WRAPPED_RE: Regex = fregex!(
            r"(?si)^</?([^\s<>/]+)[^<>]*?>(?:.*</\1\s*?>)?$");
        static ref PHRASING_RE: Regex = fregex!(
            &format!(r"(?i)^(?:{0})$", PHRASING_CONTENT));
    }

    let has_block_tag = UNWRAPPABLE_RE.is_match(text).unwrap_or_default();
    if has_block_tag || DIVIDER_RE.is_match(text).unwrap_or_default() {
        return false;
    }
    match WRAPPED_RE.captures(text).unwrap_or_default() {
        // Group 1 is the name of the wrapping tag.
        Some(wrapper) => PHRASING_RE.is_match(&wrapper[1]).unwrap_or_default(),
        None => true,
    }
}
#[cfg(test)]
mod tests {
    use super::{quoteattr, unescape, encode_html, has_raw_text};

    /// A value with double quotes but no single quotes is wrapped in single
    /// quotes; the newline and the angle brackets are escaped.
    #[test]
    fn test_quoteattr() {
        assert_eq!(
            quoteattr("So called \"escaped\"\nmulti-line <value>"),
            "'So called \"escaped\"&#10;multi-line &lt;value&gt;'");
    }

    /// Escaping followed by unescaping must give back the original text,
    /// including a literal `&nbsp;` that was present before escaping.
    #[test]
    fn test_unescape() {
        let original = r#"<a href="http://example.com">Some&nbsp;link</a>"#;
        let escaped = encode_html(original, true, false);
        assert_eq!(
            escaped,
            "&lt;a href=&quot;http://example.com&quot;&gt;Some&amp;nbsp;link&lt;/a&gt;");
        let unescaped = unescape(&escaped);
        assert_eq!(unescaped, original);
    }

    /// Block-level markup is not raw text; plain prose is.
    #[test]
    fn test_has_raw_text() {
        assert!(!has_raw_text("<p>foo bar biz baz</p>"));
        assert!(has_raw_text(" why yes, yes it does"));
    }
}