use markup5ever_rcdom::Node;
use markup5ever_rcdom::NodeData;
use markup5ever_rcdom::RcDom;
use html5ever::tendril::TendrilSink;
use html5ever::tree_builder::TreeBuilderOpts;
use html5ever::{parse_document, ParseOpts};
use lazy_static::lazy_static;
use linkify::{LinkFinder, LinkKind};
use maplit::{hashmap, hashset};
use regex::Regex;
/// Regex fragment: a capture group matching a single literal `&`.
/// Spliced into the query-parameter pattern built inside `markup_from_raw`.
// NOTE(review): presumably meant to match the escaped form of an ampersand in
// sanitized HTML — confirm against the upstream file.
const AMP: &str = "(&)";
/// Regex fragment for everything after the URL scheme: one or more characters
/// that are not whitespace, `,`, `)`, `(` or `"`.
const DOMAIN: &str = "[^\\s,)(\"]+";
/// Regex fragment for an optional URL fragment: `#` followed by one or more
/// word characters, `.`, `_` or `-`.
const HASH: &str = "(#[\\w._-]+)?";
// Declares the `block` submodule; its contents are not part of this file.
pub mod block;
/// Sanitizes `s` with an `ammonia` builder (defaults, with `link_rel` set to
/// `None`) and converts the cleaned HTML via [`markup_from_raw`].
pub fn markup(s: &str) -> String {
    let mut cleaner = ammonia::Builder::new();
    cleaner.link_rel(None);

    let cleaned = cleaner.clean(s).to_string();
    markup_from_raw(&cleaned)
}
/// Converts an HTML fragment to markup by plain text substitution and then
/// wraps every `http`/`https` URL it finds in an `<a href="…">` element.
///
/// The input is not sanitized here. Substitutions applied to the trimmed
/// input:
///
/// * `<p>` / `</p>` are removed,
/// * `<br>` becomes a newline,
/// * `<strong>` / `<b>` become `<b>`, `<em>` / `<i>` become `<i>`,
/// * `<code>` becomes `<tt>`.
///
/// Everything else is copied through unchanged before the URL pass runs.
pub fn markup_from_raw(s: &str) -> String {
    lazy_static! {
        // Tokens rewritten by the first pass.
        // NOTE(review): the last three alternatives (`<`, `>`, space) are each
        // mapped to themselves below, so they are no-ops as written — confirm
        // whether entity forms were intended here.
        static ref MATCH: Regex = Regex::new(
            r"<p>|</p>|<br>|<b>|</b>|<strong>|</strong>|<code>|</code>|<i>|</i>|<em>|</em>|<|>| "
        ).unwrap();

        // URL pattern, assembled from the module-level fragments.
        static ref PARAM: String = format!("({amp}?\\w+(=[\\w._-]+)?)", amp = AMP);
        static ref PARAMS: String = format!("(\\?{param}*)*", param = *PARAM);
        static ref REURL: String = format!(
            "(https?://{domain}{params}{hash})",
            domain = DOMAIN,
            params = *PARAMS,
            hash = HASH
        );
        static ref RE: Regex = Regex::new(&REURL).unwrap();
    }

    let input = s.trim();
    let mut rewritten = String::with_capacity(input.len());
    let mut cursor = 0;

    for found in MATCH.find_iter(input) {
        // Copy the untouched text between the previous match and this one.
        rewritten.push_str(&input[cursor..found.start()]);

        let replacement = match found.as_str() {
            "<p>" | "</p>" => "",
            "<br>" => "\n",
            "<strong>" | "<b>" => "<b>",
            "</strong>" | "</b>" => "</b>",
            "<code>" => "<tt>",
            "</code>" => "</tt>",
            "<em>" | "<i>" => "<i>",
            "</em>" | "</i>" => "</i>",
            "<" => "<",
            ">" => ">",
            " " => " ",
            // Every alternative of MATCH is listed above.
            _ => unreachable!(),
        };
        rewritten.push_str(replacement);

        cursor = found.end();
    }
    // Tail after the last match (or the whole input when nothing matched).
    rewritten.push_str(&input[cursor..]);

    RE.replace_all(rewritten.trim(), "<a href=\"$0\">$0</a>")
        .into_owned()
}
/// Sanitizes `s` against an explicit allow-list of tags, attributes and URL
/// schemes, then converts the result to markup.
///
/// The sanitized HTML is handed to [`markup_html`]; its output has leading
/// and trailing newlines stripped. If that conversion returns an error, the
/// sanitized HTML is converted with [`markup_from_raw`] instead.
pub fn matrix_html_to_markup(s: &str) -> String {
    let allowed_urls = hashset!["http", "https", "ftp", "mailto", "magnet"];

    #[cfg_attr(rustfmt, rustfmt_skip)]
    let allowed_tags = hashset![
        "font",
        "del",
        "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "p", "a", "ul", "ol", "sup", "sub",
        "nl", "li", "b", "i", "u", "strong", "em", "strike", "code", "hr", "br", "div",
        "table", "thead", "caption", "tbody", "tr", "th", "td", "pre", "span", "img",
    ];

    // Per-tag attribute allow-list.
    let allowed_attributes = hashmap![
        "font" => hashset!["color", "data-mx-bg-color", "data-mx-color", "style"],
        "span" => hashset!["data-mx-bg-color", "data-mx-color", "style"],
        "a" => hashset!["href", "name", "target", "rel"],
        "img" => hashset!["src", "width", "height", "alt", "title"],
        "ol" => hashset!["start"],
        "code" => hashset!["class"],
    ];

    let mut cleaner = ammonia::Builder::new();
    cleaner
        .url_schemes(allowed_urls)
        .tags(allowed_tags)
        .tag_attributes(allowed_attributes)
        .link_rel(None);
    let sanitized_html = cleaner.clean(s).to_string();

    match markup_html(&sanitized_html) {
        Ok(converted) => converted.trim_matches('\n').to_string(),
        Err(_) => markup_from_raw(&sanitized_html),
    }
}
/// Escapes the characters that are significant in HTML/XML markup.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and
/// `&quot;` respectively; every other character is copied unchanged.
///
/// The input is processed in a single pass, one character at a time, so the
/// ampersands introduced by the replacements are never escaped a second time.
/// An `&` that is already part of an entity in the input *is* escaped again
/// (`"&amp;"` becomes `"&amp;amp;"`): the function treats its input as plain
/// text, not as markup.
pub fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
pub fn markup_links(s: &str) -> String {
let mut parsed = String::with_capacity(s.len());
let finder = LinkFinder::new();
let mut prepend_str: Option<String> = None;
for span in finder.spans(s) {
let mut s = span.as_str().to_string();
match span.kind() {
Some(&LinkKind::Url) => {
if s.ends_with("&") {
prepend_str = Some("&".to_string());
let t = s.len() - 4;
s.truncate(t);
}
if s.ends_with("<") {
prepend_str = Some("<".to_string());
let t = s.len() - 3;
s.truncate(t);
}
if s.ends_with(">") {
prepend_str = Some(">".to_string());
let t = s.len() - 3;
s.truncate(t);
}
if s.ends_with(""") {
prepend_str = Some(""".to_string());
let t = s.len() - 5;
s.truncate(t);
}
if s.ends_with(""") {
prepend_str = Some(""".to_string() + &prepend_str.unwrap_or_default());
let t = s.len() - 6;
s.truncate(t);
}
parsed.push_str(&format!("<a href=\"{0}\">{0}</a>", s))
}
_ => {
if let Some(s) = prepend_str {
parsed.push_str(&s);
prepend_str = None;
}
parsed.push_str(&s);
}
};
}
parsed
}
/// Maps an HTML tag name to the text emitted before and after its content.
///
/// * Tags with a known translation return `Some((open, close))`.
/// * `"a"` returns `None`.
/// * Any other tag returns `Some(("", ""))`, i.e. the tag is dropped and only
///   its content is kept.
fn convert_tag<'a>(t: &'a str) -> Option<(&'a str, &'a str)> {
    let wrapping = match t {
        "i" | "em" | "blockquote" => ("<i>", "</i>"),
        "b" | "strong" => ("<b>", "</b>"),
        "code" => ("<tt>", "</tt>"),
        "p" => ("\n", "\n"),
        "br" => ("\n", ""),
        "li" => ("", "\n"),
        // Recognised, but has no fixed translation.
        "a" => return None,
        // Everything else is stripped down to its content.
        _ => ("", ""),
    };
    Some(wrapping)
}
/// Returns `true` when `node` is an element whose local tag name is one of
/// `tags`; any non-element node yields `false`.
fn match_tag(node: &Node, tags: &[&str]) -> bool {
    if let NodeData::Element { ref name, .. } = node.data {
        let local = name.local.to_string();
        tags.iter().any(|candidate| *candidate == local.as_str())
    } else {
        false
    }
}
/// Recursively converts a DOM node into markup text.
///
/// * Text nodes have their newlines removed and are HTML-escaped; when
///   `autolinks` is `true`, URLs in the text are additionally turned into
///   links via [`markup_links`].
/// * `<a>` becomes `<a href="…">…</a>` with the `href` value escaped. Its
///   children are rendered with auto-linking disabled so that no link is
///   created inside another link.
/// * `<ul>` / `<ol>` render their `li`, `ul` and `ol` children prefixed with a
///   bullet or a running number; other children are skipped.
/// * `<font>` becomes `<span foreground="…">…</span>`, using the escaped value
///   of its `color` attribute (empty when the attribute is absent).
/// * `<body>` contributes only its content.
/// * All remaining elements are translated through [`convert_tag`].
/// * Any other node kind produces an empty string.
fn convert_node(node: &Node, autolinks: bool) -> String {
    match node.data {
        NodeData::Text { contents: ref c } => {
            let escaped = html_escape(&c.borrow().replace("\n", ""));
            if autolinks {
                markup_links(&escaped)
            } else {
                escaped
            }
        }
        NodeData::Element {
            name: ref n,
            attrs: ref a,
            ..
        } => {
            let tag = n.local.to_string();

            // Value of the named attribute, or an empty string when absent.
            let attr_value = |wanted: &str| -> String {
                let mut value = String::new();
                for attr in a.borrow().iter() {
                    if attr.name.local.to_string() == wanted {
                        value = attr.value.to_string();
                    }
                }
                value
            };

            // Links are handled first: their children must be rendered without
            // auto-linking, so the generic child rendering below is skipped.
            if tag == "a" {
                let link = attr_value("href");
                let mut label = String::new();
                for child in node.children.borrow().iter() {
                    label.push_str(&convert_node(child, false));
                }
                return format!("<a href=\"{}\">{}</a>", html_escape(&link), label);
            }

            let mut content = String::new();
            match tag.as_str() {
                "ul" => {
                    content.push('\n');
                    for child in node.children.borrow().iter() {
                        if match_tag(child, &["li", "ul", "ol"]) {
                            content.push_str(&format!(" • {}", convert_node(child, true)));
                        }
                    }
                }
                "ol" => {
                    let mut counter = 1;
                    content.push('\n');
                    for child in node.children.borrow().iter() {
                        if match_tag(child, &["li", "ul", "ol"]) {
                            content.push_str(&format!(
                                " {}. {}",
                                counter,
                                convert_node(child, true)
                            ));
                            counter += 1;
                        }
                    }
                }
                _ => {
                    for child in node.children.borrow().iter() {
                        content.push_str(&convert_node(child, true));
                    }
                }
            }

            match tag.as_str() {
                "body" => content,
                "font" => {
                    // The attribute value comes from the parsed document and
                    // may contain quotes or angle brackets; escape it like
                    // `href` so it cannot break out of the attribute.
                    let color = html_escape(&attr_value("color"));
                    format!("<span foreground=\"{}\">{}</span>", color, content)
                }
                _ => match convert_tag(&tag) {
                    Some((open, close)) => format!("{}{}{}", open, content, close),
                    None => format!("<{0}>{1}</{0}>", tag, content),
                },
            }
        }
        _ => String::new(),
    }
}
pub fn markup_html(s: &str) -> Result<String, anyhow::Error> {
let opts = ParseOpts {
tree_builder: TreeBuilderOpts {
drop_doctype: true,
..Default::default()
},
..Default::default()
};
let dom = parse_document(RcDom::default(), opts)
.from_utf8()
.read_from(&mut s.as_bytes())?;
let document = &dom.document;
let html = &document.children.borrow()[0];
let body = &html.children.borrow()[1];
Ok(convert_node(body, true))
}
// Unit tests for the public conversion entry points of this module.
//
// NOTE(review): several literals below replace a character with itself
// (e.g. `.replace('&', "&")`) and one string literal in
// `test_ending_quote_link` contains an unescaped `"`. This looks like the
// HTML entities were decoded somewhere on the way — confirm against the
// upstream file before relying on these expectations.
#[cfg(test)]
mod test {
use super::*;
use pretty_assertions::assert_eq;
// `markup`: sanitize, then convert via text substitution and URL regex.
#[test]
fn test_markup() {
let m = markup("this is parsed");
assert_eq!(&m, "this is parsed");
let m = markup("this is <span>parsed</span>");
assert_eq!(&m, "this is <span>parsed</span>");
let m = markup("this is &ssdf;");
assert_eq!(&m, "this is &ssdf;");
let m = markup("<p>this <br>is &ssdf;</p>");
assert_eq!(&m, "this \nis &ssdf;");
let m = markup("<b>this <i>is &ssd<f;</i></b>");
assert_eq!(&m, "<b>this <i>is &ssd</i></b>");
// A URL with query parameters and a fragment must be linked as a whole.
let url = "http://url.com/test?foo1&foo2=test&foo3#hashing";
let m = markup(&format!("this is &ssdf; {}", url));
assert_eq!(
&m,
&format!(
"this is &ssdf; <a href=\"{0}\">{0}</a>",
url.replace('&', "&")
)
);
// Each pair is (input text, URL expected to be linked inside it).
for l in &[
("with links: http://gnome.org :D", "http://gnome.org"),
(
"with links: http://url.com/test.html&stuff :D",
"http://url.com/test.html&stuff",
),
] {
let m = markup(l.0);
assert_eq!(
&m,
&format!(
"with links: <a href=\"{0}\">{0}</a> :D",
l.1.replace('&', "&")
)
);
}
}
// `matrix_html_to_markup`: same inputs as above plus `<font>` handling,
// going through the DOM-based converter.
#[test]
fn test_matrix() {
let markup = matrix_html_to_markup;
let m = markup("this is parsed");
assert_eq!(&m, "this is parsed");
// Unlike `markup`, the `<span>` wrapper is dropped here.
let m = markup("this is <span>parsed</span>");
assert_eq!(&m, "this is parsed");
let m = markup("this is &ssdf;");
assert_eq!(&m, "this is &ssdf;");
let m = markup("<p>this <br>is &ssdf;</p>");
assert_eq!(&m, "this \nis &ssdf;");
let m = markup("<b>this <i>is &ssd<f;</i></b>");
assert_eq!(&m, "<b>this <i>is &ssd</i></b>");
// `<font color>` is expected to become `<span foreground>`.
let m = markup("hello <font color=\"#112233\">world</font>");
assert_eq!(&m, "hello <span foreground=\"#112233\">world</span>");
let m = markup("hello <em><font color=\"#112233\">http://gnome.org</font></em>");
assert_eq!(&m, "hello <i><span foreground=\"#112233\"><a href=\"http://gnome.org\">http://gnome.org</a></span></i>");
let url = "http://url.com/test?foo1&foo2=test&foo3#hashing";
let m = markup(&format!("this is &ssdf; {}", url));
assert_eq!(
&m,
&format!(
"this is &ssdf; <a href=\"{0}\">{0}</a>",
url.replace('&', "&")
)
);
// Each pair is (input text, URL expected to be linked inside it).
for l in &[
("with links: http://gnome.org :D", "http://gnome.org"),
(
"with links: http://url.com/test.html&stuff :D",
"http://url.com/test.html&stuff",
),
] {
let m = markup(l.0);
assert_eq!(
&m,
&format!(
"with links: <a href=\"{0}\">{0}</a> :D",
l.1.replace('&', "&")
)
);
}
}
// `markup_links` applied to escaped plain text. Each pair is
// (raw input, expected output after `html_escape` + `markup_links`).
#[test]
fn test_links() {
let strings = [
("clean string without markup",
"clean string without markup"),
("clean string with a <b>markup</b>",
"clean string with a <b>markup</b>"),
("clean string with a <b>markup</b> and link http://gnome.org/?p=1&q#hash",
"clean string with a <b>markup</b> and link <a href=\"http://gnome.org/?p=1&q#hash\">http://gnome.org/?p=1&q#hash</a>"),
("report-bug is: please report bugs with parabola packages on the packaging bug tracker at: https://labs.parabola.nu/projects/issue-tracker/issues?set_filter=1&tracker_id=1",
"report-bug is: please report bugs with parabola packages on the packaging bug tracker at: <a href=\"https://labs.parabola.nu/projects/issue-tracker/issues?set_filter=1&tracker_id=1\">https://labs.parabola.nu/projects/issue-tracker/issues?set_filter=1&tracker_id=1</a>"),
(
"bill-auger, isacdaavid: there are two major issues I see with gnome-software. The first issue is that flathub, the largest repo for flatpaks, has nonfree software. If flathub isn't included by default, I think this is fine. The second is archlinux-appstream-data. The [PKGBUILD](https://git.archlinux.org/svntogit/packages.git/tree/trunk/PKGBUILD?h=packages/archlinux-appstream-data) does not use appstream-generator at all. However, it does require grabbing files from sources.archlinux.org",
"bill-auger, isacdaavid: there are two major issues I see with gnome-software. The first issue is that flathub, the largest repo for flatpaks, has nonfree software. If flathub isn't included by default, I think this is fine. The second is archlinux-appstream-data. The [PKGBUILD](<a href=\"https://git.archlinux.org/svntogit/packages.git/tree/trunk/PKGBUILD?h=packages/archlinux-appstream-data\">https://git.archlinux.org/svntogit/packages.git/tree/trunk/PKGBUILD?h=packages/archlinux-appstream-data</a>) does not use appstream-generator at all. However, it does require grabbing files from sources.archlinux.org",
),
// A special character directly after the URL must stay outside the link.
("links with problems: http://gnome.org/?p=1&",
"links with problems: <a href=\"http://gnome.org/?p=1\">http://gnome.org/?p=1</a>&"),
("links with problems: http://gnome.org/?p=1>",
"links with problems: <a href=\"http://gnome.org/?p=1\">http://gnome.org/?p=1</a>>"),
("links with problems: http://gnome.org/?p=1<",
"links with problems: <a href=\"http://gnome.org/?p=1\">http://gnome.org/?p=1</a><"),
];
for &(s, e) in strings.iter() {
let m = markup_links(&html_escape(s));
assert_eq!(&m, e);
}
}
// `markup_html`: an existing `<a>` is kept while a bare URL in text is
// auto-linked.
#[test]
fn test_markup_links() {
let strings = [
("This is a test message with <em>markdown</em><br /><a href=\"http://gnome.org\">gnome</a><br />and other link http://gnome.org",
"This is a test message with <i>markdown</i>\n<a href=\"http://gnome.org\">gnome</a>\nand other link <a href=\"http://gnome.org\">http://gnome.org</a>"),
];
for &(s, e) in strings.iter() {
let m = markup_html(s).unwrap();
assert_eq!(&m, e);
}
}
// A URL that is immediately followed by a closing quote and `>`.
// NOTE(review): the expected-value literal below contains a bare `"` after
// `xmlns:boxes=`, which ends the string early — it does not compile as
// written; presumably escaped entities were intended. Confirm upstream.
#[test]
fn test_ending_quote_link() {
let strings = [
("<boxes:gnome-boxes xmlns:boxes=\"https://wiki.gnome.org/Apps/Boxes\">",
"<boxes:gnome-boxes xmlns:boxes="<a href=\"https://wiki.gnome.org/Apps/Boxes\">https://wiki.gnome.org/Apps/Boxes</a>">"),
];
for &(s, e) in strings.iter() {
let m = markup_links(&html_escape(s));
assert_eq!(&m, e);
}
}
// `markup_html` on links whose URL contains `&`.
// NOTE(review): the first two cases are identical as written — presumably
// they once differed in how `&` was escaped; confirm upstream.
#[test]
fn test_link_scape() {
let strings = [
("<a href=\"https://forums.transbian.love/?page=thread&id=69\">https://forums.transbian.love/?page=thread&id=69</a>",
"<a href=\"https://forums.transbian.love/?page=thread&id=69\">https://forums.transbian.love/?page=thread&id=69</a>"),
("<a href=\"https://forums.transbian.love/?page=thread&id=69\">https://forums.transbian.love/?page=thread&id=69</a>",
"<a href=\"https://forums.transbian.love/?page=thread&id=69\">https://forums.transbian.love/?page=thread&id=69</a>"),
("https://forums.transbian.love/?page=thread&id=69",
"<a href=\"https://forums.transbian.love/?page=thread&id=69\">https://forums.transbian.love/?page=thread&id=69</a>"),
];
for &(s, e) in strings.iter() {
let m = markup_html(s).unwrap();
assert_eq!(&m, e);
}
}
}