use once_cell::sync::Lazy;
use regex::Regex;
use std::borrow::Cow;
use wikidot_normalize::normalize;
#[cfg(feature = "html")]
use crate::tree::LinkLocation;
/// Scheme prefixes that [`is_url`] accepts as marking an absolute URL.
///
/// Each entry includes its separator (`:` or `://`), so it can be compared
/// directly against the start of a candidate string.
///
/// The entries are listed in byte-wise sorted order. The array length in the
/// type must be updated whenever an entry is added or removed.
pub const URL_SCHEMES: [&str; 19] = [
"blob:",
"chrome-extension://",
"chrome://",
"content://",
"dns:",
"feed:",
"file://",
"ftp://",
"git://",
"gopher://",
"http://",
"https://",
"irc6://",
"irc://",
"ircs://",
"mailto:",
"resource://",
"rtmp://",
"sftp://",
];
/// Returns whether `url` begins with one of the scheme prefixes in [`URL_SCHEMES`].
///
/// The comparison ignores ASCII case, because URI schemes are case-insensitive
/// (RFC 3986, section 3.1). This means `HTTP://example.com/` is recognised
/// in the same way as `http://example.com/`.
pub fn is_url(url: &str) -> bool {
    URL_SCHEMES.iter().any(|scheme| {
        // `get` yields `None` when `url` is shorter than the prefix, or when the
        // cut would land inside a multi-byte character. Neither can be a match,
        // since every scheme prefix is plain ASCII.
        url.get(..scheme.len())
            .map_or(false, |prefix| prefix.eq_ignore_ascii_case(scheme))
    })
}
/// Reports whether `url` carries a scheme that must not be emitted as a link target.
///
/// The scheme is the text before the first `:`. The result is:
/// * `false` if `url` contains no `:` at all;
/// * `true` if the scheme is empty or contains anything other than word
///   characters and hyphens (for example embedded whitespace);
/// * `true` if the scheme is `javascript` or `data`, ignoring ASCII case;
/// * `false` otherwise.
pub fn dangerous_scheme(url: &str) -> bool {
    static SCHEME_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[\w\-]+$").unwrap());

    match url.split_once(':') {
        // No colon means there is no scheme to object to.
        None => false,
        Some((scheme, _)) => {
            // A malformed scheme is treated as hostile rather than ignored.
            let well_formed = SCHEME_REGEX.is_match(scheme);
            let blocked = scheme.eq_ignore_ascii_case("javascript")
                || scheme.eq_ignore_ascii_case("data");

            !well_formed || blocked
        }
    }
}
/// Produces the `href` text for a link location.
///
/// * A `LinkLocation::Url` is passed through [`normalize_href`].
/// * A `LinkLocation::Page` whose reference names a site is turned into a URL
///   by `helper.build_url(site, page)`.
/// * A `LinkLocation::Page` without a site has its page passed through
///   [`normalize_href`].
#[cfg(feature = "html")]
pub fn normalize_link<'a>(
    link: &'a LinkLocation<'a>,
    helper: &dyn BuildSiteUrl,
) -> Cow<'a, str> {
    // Plain URLs need nothing beyond href normalization.
    let page_ref = match link {
        LinkLocation::Url(url) => return normalize_href(url),
        LinkLocation::Page(page_ref) => page_ref,
    };

    let (site, page) = page_ref.fields();
    if let Some(site) = site {
        // Building a URL for a named site is delegated to the helper.
        Cow::Owned(helper.build_url(site, page))
    } else {
        normalize_href(page)
    }
}
/// Converts a link target into the string to be used as an `href`.
///
/// * Strings accepted by [`is_url`], strings starting with `#`, and the exact
///   string `javascript:;` are returned unchanged (borrowed).
/// * Strings flagged by [`dangerous_scheme`] are logged with a warning and
///   replaced by `#invalid-url`.
/// * Anything else is treated as a page path. The part after the first `#` is
///   set aside, the rest is split on `/`, a leading empty segment is added if
///   missing so that the result starts with `/`, the first segment after it is
///   run through `normalize`, and the pieces are joined back together with
///   the anchor re-attached.
pub fn normalize_href(url: &str) -> Cow<str> {
    // Already usable as-is.
    if is_url(url) || url.starts_with('#') || url == "javascript:;" {
        return Cow::Borrowed(url);
    }

    if dangerous_scheme(url) {
        warn!("Attempt to pass in dangerous URL: {url}");
        return Cow::Borrowed("#invalid-url");
    }

    // Separate the anchor (everything after the first '#') from the path.
    let (path, anchor) = match url.split_once('#') {
        Some((path, anchor)) => (path, Some(anchor)),
        None => (url, None),
    };

    // Guarantee a leading empty segment, so index 1 always exists and the
    // joined result begins with '/'.
    let mut segments: Vec<&str> = path.split('/').collect();
    if !segments[0].is_empty() || segments.len() == 1 {
        segments.insert(0, "");
    }

    // Only the first real segment is normalized; later segments are kept verbatim.
    let mut slug = str!(segments[1]);
    normalize(&mut slug);
    segments[1] = &slug;

    let mut href = segments.join("/");
    if let Some(anchor) = anchor {
        href.push('#');
        href.push_str(anchor);
    }

    Cow::Owned(href)
}
/// Hook for turning a `(site, path)` pair into a URL string.
///
/// `normalize_link` calls this when a page reference names a site, and uses
/// the returned string as the link's `href`.
pub trait BuildSiteUrl {
/// Returns the URL string for `path` on `site`.
fn build_url(&self, site: &str, path: &str) -> String;
}
#[test]
fn detect_dangerous_schemes() {
    // Each entry pairs an input with the verdict `dangerous_scheme` must give.
    let cases: &[(&str, bool)] = &[
        // Ordinary schemes are allowed.
        ("http://example.com/", false),
        ("https://example.com/", false),
        ("irc://irc.scpwiki.com", false),
        // `javascript:` in any letter case, or disguised with whitespace.
        ("javascript:alert(1)", true),
        ("JAVASCRIPT:alert(1)", true),
        (" javascript:alert(1)", true),
        ("java\nscript:alert(1)", true),
        ("javascript\t:alert(1)", true),
        // Malformed scheme text.
        ("wtf$1:foo", true),
        ("JaVaScRiPt:alert(document.cookie)", true),
        // `data:` in any letter case.
        ("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==", true),
        ("data:text/javascript,alert(1)", true),
        ("data:text/html,<script>alert('XSS');</script>", true),
        ("DATA:text/html,<script>alert('XSS');</script>", true),
    ];

    for &(input, expected) in cases {
        assert_eq!(
            dangerous_scheme(input),
            expected,
            "For input {:?}, dangerous scheme detection failed",
            input,
        );
    }
}