1use once_cell::sync::Lazy;
22use regex::Regex;
23use std::borrow::Cow;
24use wikidot_normalize::normalize;
25
26#[cfg(feature = "html")]
27use crate::tree::LinkLocation;
28
/// URL scheme prefixes that [`is_url`] recognizes.
///
/// Every entry includes its trailing separator (`:` or `://`), because
/// [`is_url`] compares each one against the start of the candidate string
/// with a plain, case-sensitive prefix match.
pub const URL_SCHEMES: [&str; 19] = [
    "blob:",
    "chrome-extension://",
    "chrome://",
    "content://",
    "dns:",
    "feed:",
    "file://",
    "ftp://",
    "git://",
    "gopher://",
    "http://",
    "https://",
    "irc6://",
    "irc://",
    "ircs://",
    "mailto:",
    "resource://",
    "rtmp://",
    "sftp://",
];
50
51pub fn is_url(url: &str) -> bool {
52 for scheme in &URL_SCHEMES {
54 if url.starts_with(scheme) {
55 return true;
56 }
57 }
58
59 false
60}
61
62pub fn dangerous_scheme(url: &str) -> bool {
69 static SCHEME_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[\w\-]+$").unwrap());
70
71 url.split_once(':')
72 .map(|(scheme, _)| {
73 if !SCHEME_REGEX.is_match(scheme) {
74 return true;
76 }
77
78 scheme.eq_ignore_ascii_case("javascript")
81 || scheme.eq_ignore_ascii_case("data")
82 })
83 .unwrap_or(false)
84}
85
/// Resolves a link location to the string used as its link target.
///
/// * URL locations are passed through [`normalize_href`].
/// * Page locations that name a site are handed to `helper.build_url`,
///   and its result is returned unchanged.
/// * Page locations without a site are passed through [`normalize_href`].
#[cfg(feature = "html")]
pub fn normalize_link<'a>(
    link: &'a LinkLocation<'a>,
    helper: &dyn BuildSiteUrl,
) -> Cow<'a, str> {
    let page_ref = match link {
        LinkLocation::Url(url) => return normalize_href(url),
        LinkLocation::Page(page_ref) => page_ref,
    };

    let (site, page) = page_ref.fields();
    if let Some(site) = site {
        // Cross-site reference: the helper decides what the URL looks like.
        Cow::Owned(helper.build_url(site, page))
    } else {
        normalize_href(page)
    }
}
103
104pub fn normalize_href(url: &str) -> Cow<str> {
105 if is_url(url) || url.starts_with('#') || url == "javascript:;" {
106 Cow::Borrowed(url)
107 } else if dangerous_scheme(url) {
108 warn!("Attempt to pass in dangerous URL: {url}");
109 Cow::Borrowed("#invalid-url")
110 } else {
111 let split_anchor: Vec<&str> = url.splitn(2, "#").collect();
112 let mut split_url: Vec<&str> = split_anchor[0].split("/").collect();
113 if !split_url[0].is_empty() || (split_url[0].is_empty() && split_url.len() == 1) {
114 split_url.insert(0, "");
115 }
116 let mut url = str!(split_url[1]);
117 normalize(&mut url);
118 split_url[1] = &url;
119 url = split_url.join("/");
120 if split_anchor.len() == 2 {
121 url = format!("{}#{}", url, split_anchor[1]);
122 }
123 Cow::Owned(url)
124 }
125}
126
/// Builds URLs for links that point at a page on a named site.
///
/// `normalize_link` calls this when a page reference carries a site
/// component, and returns the resulting string as the link target without
/// further processing.
pub trait BuildSiteUrl {
    /// Returns the URL for `path` on `site`.
    ///
    /// `normalize_link` forwards both values as obtained from the page
    /// reference, without passing them through `normalize_href` first.
    fn build_url(&self, site: &str, path: &str) -> String;
}
130
/// Checks `dangerous_scheme` against ordinary URLs, `javascript:` / `data:`
/// URLs in various casings, and schemes containing stray characters.
#[test]
fn detect_dangerous_schemes() {
    // (input, expected result of `dangerous_scheme(input)`)
    const CASES: [(&str, bool); 14] = [
        ("http://example.com/", false),
        ("https://example.com/", false),
        ("irc://irc.scpwiki.com", false),
        ("javascript:alert(1)", true),
        ("JAVASCRIPT:alert(1)", true),
        (" javascript:alert(1)", true),
        ("java\nscript:alert(1)", true),
        ("javascript\t:alert(1)", true),
        ("wtf$1:foo", true),
        ("JaVaScRiPt:alert(document.cookie)", true),
        ("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==", true),
        ("data:text/javascript,alert(1)", true),
        ("data:text/html,<script>alert('XSS');</script>", true),
        ("DATA:text/html,<script>alert('XSS');</script>", true),
    ];

    for &(input, expected) in CASES.iter() {
        assert_eq!(
            dangerous_scheme(input),
            expected,
            "For input {:?}, dangerous scheme detection failed",
            input,
        );
    }
}