Skip to main content

agent_fetch/
url_check.rs

1use url::Url;
2
3use crate::error::FetchError;
4
5/// Parsed and validated URL, safe for further processing.
6#[derive(Debug, Clone)]
7pub struct ValidatedUrl {
8    pub url: Url,
9    pub host: String,
10    pub scheme: String,
11}
12
13/// Parse, normalize, and validate a URL.
14///
15/// Rejects:
16/// - Non-http(s) schemes (detected later by policy, but data:/javascript: rejected here)
17/// - URLs with embedded credentials
18/// - IP addresses encoded as hex, octal, or decimal integers
19/// - Hosts that are empty after normalization
20pub fn validate_url(raw: &str) -> Result<ValidatedUrl, FetchError> {
21    let url = Url::parse(raw).map_err(|e| FetchError::InvalidUrl(e.to_string()))?;
22
23    let scheme = url.scheme().to_lowercase();
24
25    if scheme != "http" && scheme != "https" {
26        return Err(FetchError::SchemeNotAllowed(scheme));
27    }
28
29    if !url.username().is_empty() || url.password().is_some() {
30        return Err(FetchError::InvalidUrl(
31            "URLs with embedded credentials are not allowed".into(),
32        ));
33    }
34
35    let host = url
36        .host_str()
37        .ok_or_else(|| FetchError::InvalidUrl("URL has no host".into()))?;
38
39    let host = host.to_lowercase().trim_end_matches('.').to_string();
40
41    if host.is_empty() {
42        return Err(FetchError::InvalidUrl("empty host".into()));
43    }
44
45    Ok(ValidatedUrl { url, host, scheme })
46}
47
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain https URL is accepted and exposes its host and scheme.
    #[test]
    fn valid_https_url() {
        let validated = validate_url("https://example.com/path").unwrap();
        assert_eq!(validated.host, "example.com");
        assert_eq!(validated.scheme, "https");
    }

    /// Both `user:pass@` and bare `user@` userinfo forms are refused.
    #[test]
    fn rejects_credentials() {
        for raw in ["https://user:pass@example.com", "https://user@example.com"] {
            assert!(validate_url(raw).is_err());
        }
    }

    /// `data:` URLs are refused.
    #[test]
    fn rejects_data_urls() {
        let outcome = validate_url("data:text/html,<h1>Hi</h1>");
        assert!(outcome.is_err());
    }

    /// `file:` URLs are refused.
    #[test]
    fn rejects_file_urls() {
        let outcome = validate_url("file:///etc/passwd");
        assert!(outcome.is_err());
    }

    /// An upper-case host comes back lower-cased.
    #[test]
    fn normalizes_host_case() {
        let validated = validate_url("https://EXAMPLE.COM/path").unwrap();
        assert_eq!(validated.host, "example.com");
    }

    /// A trailing root dot on the host is removed.
    #[test]
    fn strips_trailing_dot() {
        let validated = validate_url("https://example.com./path").unwrap();
        assert_eq!(validated.host, "example.com");
    }

    /// Decimal, hex and octal spellings of the loopback address are accepted
    /// and all surface as the dotted form.
    #[test]
    fn url_crate_normalizes_encoded_ips() {
        let encoded_loopbacks = [
            "http://2130706433/",
            "http://0x7f000001/",
            "http://0177.0.0.1/",
        ];
        for raw in encoded_loopbacks {
            let validated = validate_url(raw).unwrap();
            assert_eq!(validated.host, "127.0.0.1");
        }
    }

    /// An ordinary dotted-quad host passes through unchanged.
    #[test]
    fn allows_normal_dotted_ip() {
        let validated = validate_url("http://127.0.0.1/").unwrap();
        assert_eq!(validated.host, "127.0.0.1");
    }

    /// `http:///path` may be rejected outright; if it is accepted, the host
    /// must be either empty or `"path"`.
    #[test]
    fn empty_host_url() {
        if let Ok(validated) = validate_url("http:///path") {
            assert!(validated.host.is_empty() || validated.host == "path");
        }
    }
}