Skip to main content

ftml/
url.rs

1/*
2 * url.rs
3 *
4 * ftml - Library to parse Wikidot text
5 * Copyright (C) 2019-2026 Wikijump Team
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Affero General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Affero General Public License for more details.
16 *
17 * You should have received a copy of the GNU Affero General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21use regex::Regex;
22use std::borrow::Cow;
23use std::sync::LazyLock;
24
25#[cfg(feature = "html")]
26use crate::tree::LinkLocation;
27
/// URL scheme prefixes that are recognized as absolute URLs.
///
/// Each entry includes its separator (`:` or `://`), so it can be compared
/// directly against the start of a candidate string.
pub const URL_SCHEMES: [&str; 19] = [
    "blob:",
    "chrome-extension://",
    "chrome://",
    "content://",
    "dns:",
    "feed:",
    "file://",
    "ftp://",
    "git://",
    "gopher://",
    "http://",
    "https://",
    "irc6://",
    "irc://",
    "ircs://",
    "mailto:",
    "resource://",
    "rtmp://",
    "sftp://",
];

/// Returns `true` if `url` begins with one of the [`URL_SCHEMES`] prefixes.
///
/// The comparison ignores ASCII case, because URI schemes are
/// case-insensitive (RFC 3986, section 3.1). `HTTP://example.com` is
/// therefore recognized just like `http://example.com`.
pub fn is_url(url: &str) -> bool {
    URL_SCHEMES.iter().any(|scheme| {
        // `get` yields `None` when `url` is shorter than the scheme, or when
        // the cut would land inside a multi-byte character, so this cannot
        // panic on arbitrary input.
        url.get(..scheme.len())
            .is_some_and(|prefix| prefix.eq_ignore_ascii_case(scheme))
    })
}
60
61/// Returns true if the scheme for this URL is `javascript:` or `data:`.
62/// This function works case-insensitively (for ASCII).
63///
64/// Additionally, there is a check to make sure that there isn't any
65/// funny business going on with the scheme, such as insertion of
66/// whitespace. In such cases, the URL is rejected.
67///
68/// This function does not check anything starting with `/`, since
69/// this would be a relative link.
70pub fn dangerous_scheme(url: &str) -> bool {
71    static SCHEME_REGEX: LazyLock<Regex> =
72        LazyLock::new(|| Regex::new(r"^[\w\-]+$").unwrap());
73
74    // Ignore relative links
75    if url.starts_with('/') {
76        return false;
77    }
78
79    // Get the scheme from the URL
80    url.split_once(':')
81        .map(|(scheme, _)| {
82            if !SCHEME_REGEX.is_match(scheme) {
83                // Weird scheme like "java\nscript", reject.
84                return true;
85            }
86
87            // Now that we've confirmed it's normal,
88            // check for these specific dangerous schemes.
89            scheme.eq_ignore_ascii_case("javascript")
90                || scheme.eq_ignore_ascii_case("data")
91        })
92        .unwrap_or(false)
93}
94
/// Resolve a [`LinkLocation`] into the string to use as its `href`.
///
/// * A plain URL is passed through [`normalize_href`] with no extra part.
/// * A page reference that names a site is delegated to
///   `helper.build_url()`, and its result is used as-is.
/// * A page reference without a site is passed through [`normalize_href`]
///   together with its extra part.
#[cfg(feature = "html")]
pub fn normalize_link<'a>(
    link: &'a LinkLocation<'a>,
    helper: &dyn BuildSiteUrl,
) -> Cow<'a, str> {
    // Plain URLs need nothing beyond href normalization.
    let page_ref = match link {
        LinkLocation::Url(url) => return normalize_href(url, None),
        LinkLocation::Page(page_ref) => page_ref,
    };

    let (site, page, extra) = page_ref.fields();
    if let Some(site) = site {
        Cow::Owned(helper.build_url(site, page, extra))
    } else {
        normalize_href(page, extra)
    }
}
111
112/// Normalize a URL string.
113///
114/// This performs a few operations:
115/// * Blocking dangerous URLs (e.g. `javascript:alert(1)`)
116/// * For relative links, normalizing the page portion (e.g. `/SCP-001/edit`)
117/// * Adds a leading `/` if it is missing.
118///
119/// The `extra` argument corresponds to `PageRef.extra`.
120/// It shouldn't be `Some(_)` for other kinds of links.
121pub fn normalize_href<'a>(url: &'a str, extra: Option<&'a str>) -> Cow<'a, str> {
122    if url == "javascript:;" {
123        trace!("Leaving no-op link as-is");
124        Cow::Borrowed(url)
125    } else if is_url(url) || url.starts_with('/') || url.starts_with('#') {
126        match extra {
127            Some(extra) => {
128                trace!("Leaving safe URL with extra as-is: {url}{extra}");
129                Cow::Owned(format!("{url}{extra}"))
130            }
131            None => {
132                trace!("Leaving safe URL as-is: {url}");
133                Cow::Borrowed(url)
134            }
135        }
136    } else if dangerous_scheme(url) {
137        warn!("Attempt to pass in dangerous URL: {url}");
138        Cow::Borrowed("#invalid-url")
139    } else {
140        // In this branch, the URL is not absolute (e.g. https://example.com)
141        // and so must be a relative link with no leading / (e.g. just "some-page").
142        let extra = extra.unwrap_or("");
143        trace!("Adding leading slash to URL: {url}{extra}");
144        Cow::Owned(format!("/{url}{extra}"))
145    }
146}
147
/// Strategy for turning a page reference that names a site into a URL.
///
/// `normalize_link` calls this for page links with an explicit site and
/// uses the returned string as the link target without further changes.
pub trait BuildSiteUrl {
    /// Build the URL for `path` on `site`.
    ///
    /// `extra` is the optional trailing part of the page reference
    /// (`PageRef.extra`); what to do with it is up to the implementor.
    fn build_url(&self, site: &str, path: &str, extra: Option<&str>) -> String;
}
151
#[test]
fn detect_dangerous_schemes() {
    // Each entry pairs an input with the verdict `dangerous_scheme()` must give.
    const CASES: &[(&str, bool)] = &[
        // Ordinary absolute URLs
        ("http://example.com/", false),
        ("https://example.com/", false),
        ("irc://irc.scpwiki.com", false),
        // Script and data schemes, including case and whitespace tricks
        ("javascript:alert(1)", true),
        ("JAVASCRIPT:alert(1)", true),
        (" javascript:alert(1)", true),
        ("java\nscript:alert(1)", true),
        ("javascript\t:alert(1)", true),
        ("wtf$1:foo", true),
        ("JaVaScRiPt:alert(document.cookie)", true),
        ("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==", true),
        ("data:text/javascript,alert(1)", true),
        ("data:text/html,<script>alert('XSS');</script>", true),
        ("DATA:text/html,<script>alert('XSS');</script>", true),
        // Relative links are never inspected
        ("/page", false),
        ("/page#target", false),
        ("/page/edit", false),
        ("/page/edit#target", false),
        ("/category:page", false),
        ("/category:page#target", false),
        ("/category:page/edit", false),
        ("/category:page/edit#target", false),
    ];

    for &(input, expected) in CASES {
        assert_eq!(
            dangerous_scheme(input),
            expected,
            "For input {:?}, dangerous scheme detection failed",
            input,
        );
    }
}
188
#[test]
fn test_normalize_href() {
    // Each entry is (url, extra, expected result of `normalize_href`).
    const CASES: &[(&str, Option<&str>, &str)] = &[
        // Basic targets
        ("#", None, "#"),
        ("#target", None, "#target"),
        ("#edit-area", None, "#edit-area"),
        ("javascript:;", None, "javascript:;"),
        ("http://example.net", None, "http://example.net"),
        ("https://example.net", None, "https://example.net"),
        ("irc://irc.scpwiki.com", None, "irc://irc.scpwiki.com"),
        (
            "sftp://ftp.example.com/upload",
            None,
            "sftp://ftp.example.com/upload",
        ),
        // Dangerous
        ("javascript:alert(1)", None, "#invalid-url"),
        (
            "data:text/html,<script>alert('XSS')</script>",
            None,
            "#invalid-url",
        ),
        // Preserve page links
        ("/page", None, "/page"),
        ("/page", Some("#target"), "/page#target"),
        ("/page", Some("/edit"), "/page/edit"),
        ("page", Some("/edit#target"), "/page/edit#target"),
        ("/category:page", None, "/category:page"),
        ("/category:page", Some("#target"), "/category:page#target"),
        ("/category:page", Some("/edit"), "/category:page/edit"),
        (
            "/category:page",
            Some("/edit#target"),
            "/category:page/edit#target",
        ),
        // Missing / prefix
        ("some-page", None, "/some-page"),
        ("some-page#target", None, "/some-page#target"),
        ("system:some-page", None, "/system:some-page"),
        ("system:some-page#target", None, "/system:some-page#target"),
    ];

    for &(url_input, extra_input, expected) in CASES {
        let actual = normalize_href(url_input, extra_input);

        // The failure message mentions the extra part only when one was given.
        match extra_input {
            None => assert_eq!(
                actual.as_ref(),
                expected,
                "For input {:?}, normalize_href() doesn't match expected",
                url_input,
            ),
            Some(extra_input) => assert_eq!(
                actual.as_ref(),
                expected,
                "For input {:?} / {:?}, normalize_href() doesn't match expected",
                url_input,
                extra_input,
            ),
        }
    }
}