ftml/
url.rs

/*
 * url.rs
 *
 * ftml - Library to parse Wikidot text
 * Copyright (C) 2019-2025 Wikijump Team
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
20
21use once_cell::sync::Lazy;
22use regex::Regex;
23use std::borrow::Cow;
24use wikidot_normalize::normalize;
25
26#[cfg(feature = "html")]
27use crate::tree::LinkLocation;
28
/// Scheme prefixes that [`is_url`] accepts as the start of an absolute URL.
///
/// Each entry includes its trailing `:` or `://`, so a plain
/// `starts_with` comparison against the input is sufficient.
/// That comparison is case-sensitive, and all entries are lowercase.
pub const URL_SCHEMES: [&str; 19] = [
    "blob:",
    "chrome-extension://",
    "chrome://",
    "content://",
    "dns:",
    "feed:",
    "file://",
    "ftp://",
    "git://",
    "gopher://",
    "http://",
    "https://",
    "irc6://",
    "irc://",
    "ircs://",
    "mailto:",
    "resource://",
    "rtmp://",
    "sftp://",
];
50
51pub fn is_url(url: &str) -> bool {
52    // If it's a URL
53    for scheme in &URL_SCHEMES {
54        if url.starts_with(scheme) {
55            return true;
56        }
57    }
58
59    false
60}
61
62/// Returns true if the scheme for this URL is `javascript:` or `data:`.
63/// This function works case-insensitively (for ASCII).
64///
65/// Additionally, there is a check to make sure that there isn't any
66/// funny business going on with the scheme, such as insertion of
67/// whitespace. In such cases, the URL is rejected.
68pub fn dangerous_scheme(url: &str) -> bool {
69    static SCHEME_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[\w\-]+$").unwrap());
70
71    url.split_once(':')
72        .map(|(scheme, _)| {
73            if !SCHEME_REGEX.is_match(scheme) {
74                // Weird scheme like "java\nscript", reject.
75                return true;
76            }
77
78            // Now that we've confirmed it's normal,
79            // check for these specific dangerous schemes.
80            scheme.eq_ignore_ascii_case("javascript")
81                || scheme.eq_ignore_ascii_case("data")
82        })
83        .unwrap_or(false)
84}
85
/// Converts a [`LinkLocation`] into the string to use as a link target.
///
/// * A `Url` location is passed through [`normalize_href`].
/// * A `Page` location that names a site is delegated to
///   `helper.build_url(site, page)`.
/// * A `Page` location without a site has its page value passed
///   through [`normalize_href`].
#[cfg(feature = "html")]
pub fn normalize_link<'a>(
    link: &'a LinkLocation<'a>,
    helper: &dyn BuildSiteUrl,
) -> Cow<'a, str> {
    // Raw URLs need no site resolution, handle them up front.
    let page_ref = match link {
        LinkLocation::Url(url) => return normalize_href(url),
        LinkLocation::Page(page_ref) => page_ref,
    };

    let (site, page) = page_ref.fields();
    if let Some(site) = site {
        // Explicit site, let the caller-provided helper build the URL.
        return Cow::Owned(helper.build_url(site, page));
    }

    normalize_href(page)
}
103
104pub fn normalize_href(url: &str) -> Cow<str> {
105    if is_url(url) || url.starts_with('#') || url == "javascript:;" {
106        Cow::Borrowed(url)
107    } else if dangerous_scheme(url) {
108        warn!("Attempt to pass in dangerous URL: {url}");
109        Cow::Borrowed("#invalid-url")
110    } else {
111        let split_anchor: Vec<&str> = url.splitn(2, "#").collect();
112        let mut split_url: Vec<&str> = split_anchor[0].split("/").collect();
113        if !split_url[0].is_empty() || (split_url[0].is_empty() && split_url.len() == 1) {
114            split_url.insert(0, "");
115        }
116        let mut url = str!(split_url[1]);
117        normalize(&mut url);
118        split_url[1] = &url;
119        url = split_url.join("/");
120        if split_anchor.len() == 2 {
121            url = format!("{}#{}", url, split_anchor[1]);
122        }
123        Cow::Owned(url)
124    }
125}
126
/// Hook for constructing URLs to pages when a link names an explicit site.
///
/// `normalize_link` calls this for page links that carry a site value.
pub trait BuildSiteUrl {
    /// Builds the URL string for the page at `path` on the given `site`.
    fn build_url(&self, site: &str, path: &str) -> String;
}
130
/// Verifies `dangerous_scheme` against safe URLs, dangerous schemes
/// in various casings, and malformed schemes.
#[test]
fn detect_dangerous_schemes() {
    // (input, expected result of dangerous_scheme)
    let cases: &[(&str, bool)] = &[
        ("http://example.com/", false),
        ("https://example.com/", false),
        ("irc://irc.scpwiki.com", false),
        ("javascript:alert(1)", true),
        ("JAVASCRIPT:alert(1)", true),
        (" javascript:alert(1)", true),
        ("java\nscript:alert(1)", true),
        ("javascript\t:alert(1)", true),
        ("wtf$1:foo", true),
        ("JaVaScRiPt:alert(document.cookie)", true),
        ("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==", true),
        ("data:text/javascript,alert(1)", true),
        ("data:text/html,<script>alert('XSS');</script>", true),
        ("DATA:text/html,<script>alert('XSS');</script>", true),
    ];

    for &(input, expected) in cases {
        assert_eq!(
            dangerous_scheme(input),
            expected,
            "For input {:?}, dangerous scheme detection failed",
            input,
        );
    }
}
156    check!("data:text/html,<script>alert('XSS');</script>", true);
157    check!("DATA:text/html,<script>alert('XSS');</script>", true);
158}