Skip to main content

index_core/
url.rs

1//! URL semantic types.
2
3use std::fmt::{Display, Formatter};
4
5use ::url::Url;
6
/// URL schemes that Index tells apart.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Scheme {
    /// The `http` scheme.
    Http,
    /// The `https` scheme.
    Https,
    /// The `file` scheme, also used for explicit local file paths.
    File,
    /// Any other scheme name; [`Scheme::parse`] stores it ASCII-lowercased.
    Other(String),
}

impl Scheme {
    /// Classifies a scheme name, ignoring ASCII case.
    ///
    /// The input is not validated: anything that is not `http`, `https` or
    /// `file` is returned as [`Scheme::Other`] holding the ASCII-lowercased
    /// input.
    #[must_use]
    pub fn parse(input: &str) -> Self {
        if input.eq_ignore_ascii_case("http") {
            Self::Http
        } else if input.eq_ignore_ascii_case("https") {
            Self::Https
        } else if input.eq_ignore_ascii_case("file") {
            Self::File
        } else {
            Self::Other(input.to_ascii_lowercase())
        }
    }

    /// Reports whether the initial security policy permits this scheme.
    ///
    /// Only the first-class schemes (`http`, `https`, `file`) are permitted.
    #[must_use]
    pub const fn is_initially_allowed(&self) -> bool {
        match self {
            Self::Http | Self::Https | Self::File => true,
            Self::Other(_) => false,
        }
    }
}
38
/// Reasons a URL can be rejected during parsing or validation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UrlError {
    /// Nothing was left of the input to parse.
    Empty,
    /// The input contained whitespace.
    ContainsWhitespace,
    /// No scheme separator was found in the input.
    MissingScheme,
    /// The scheme (carried in the payload) is rejected by the current policy.
    DisallowedScheme(String),
    /// The URL parser refused the input; the payload is its explanation.
    Invalid(String),
    /// An HTTP or HTTPS URL was given without a host.
    MissingHost,
}

impl Display for UrlError {
    /// Writes the human-readable message for this error.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Variants with a payload are formatted and returned directly; the
        // rest resolve to a fixed message written once at the end.
        let message = match self {
            Self::DisallowedScheme(scheme) => {
                return write!(f, "URL scheme is not allowed: {scheme}");
            }
            Self::Invalid(reason) => {
                return write!(f, "URL is invalid: {reason}");
            }
            Self::Empty => "URL is empty",
            Self::ContainsWhitespace => "URL contains whitespace",
            Self::MissingScheme => "URL is missing a scheme",
            Self::MissingHost => "HTTP URL is missing a host",
        };
        f.write_str(message)
    }
}

impl std::error::Error for UrlError {}
70
71/// A validated URL accepted by Index.
72#[derive(Debug, Clone, PartialEq, Eq, Hash)]
73pub struct IndexUrl {
74    raw: String,
75    scheme: String,
76}
77
78impl IndexUrl {
79    /// Parses and validates a URL according to the initial Index policy.
80    ///
81    pub fn parse(input: impl AsRef<str>) -> Result<Self, UrlError> {
82        let trimmed = input.as_ref().trim();
83        if trimmed.is_empty() {
84            return Err(UrlError::Empty);
85        }
86        if trimmed.chars().any(char::is_whitespace) {
87            return Err(UrlError::ContainsWhitespace);
88        }
89
90        let Some((scheme, rest)) = trimmed.split_once(':') else {
91            return Err(UrlError::MissingScheme);
92        };
93
94        let parsed_scheme = Scheme::parse(scheme);
95        if !parsed_scheme.is_initially_allowed() {
96            return Err(UrlError::DisallowedScheme(scheme.to_owned()));
97        }
98        if matches!(parsed_scheme, Scheme::Http | Scheme::Https)
99            && (!rest.starts_with("//") || rest.starts_with("///"))
100        {
101            return Err(UrlError::MissingHost);
102        }
103
104        let mut parsed =
105            Url::parse(trimmed).map_err(|error| UrlError::Invalid(error.to_string()))?;
106        if matches!(parsed_scheme, Scheme::Http | Scheme::Https) && parsed.host_str().is_none() {
107            return Err(UrlError::MissingHost);
108        }
109        parsed.set_fragment(None);
110
111        Ok(Self {
112            raw: parsed.to_string(),
113            scheme: parsed.scheme().to_owned(),
114        })
115    }
116
117    /// Returns the original normalized string.
118    #[must_use]
119    pub fn as_str(&self) -> &str {
120        &self.raw
121    }
122
123    /// Returns the lowercased scheme.
124    #[must_use]
125    pub fn scheme(&self) -> &str {
126        &self.scheme
127    }
128
129    /// Returns the origin used for per-site state when one can be derived.
130    #[must_use]
131    pub fn origin(&self) -> Option<Origin> {
132        Origin::from_url(self)
133    }
134
135    /// Returns a deterministic filesystem-safe cache key.
136    #[must_use]
137    pub fn cache_key(&self) -> String {
138        let mut key = String::with_capacity(self.raw.len());
139        let mut previous_was_separator = false;
140        for ch in self.raw.chars() {
141            if ch.is_ascii_alphanumeric() {
142                key.push(ch.to_ascii_lowercase());
143                previous_was_separator = false;
144            } else if !previous_was_separator {
145                key.push('_');
146                previous_was_separator = true;
147            }
148        }
149        let trimmed = key.trim_matches('_');
150        if trimmed.is_empty() {
151            "url".to_owned()
152        } else {
153            trimmed.to_owned()
154        }
155    }
156}
157
158impl Display for IndexUrl {
159    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
160        f.write_str(self.as_str())
161    }
162}
163
164/// A normalized URL origin used for per-site state.
165#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
166pub struct Origin(String);
167
168impl Origin {
169    /// Builds an origin from a normalized Index URL.
170    #[must_use]
171    pub fn from_url(url: &IndexUrl) -> Option<Self> {
172        let parsed = Url::parse(url.as_str()).ok()?;
173        match parsed.scheme() {
174            "http" | "https" => {
175                let host = parsed.host_str()?;
176                let port = parsed
177                    .port()
178                    .map(|port| format!(":{port}"))
179                    .unwrap_or_default();
180                Some(Self(format!("{}://{}{}", parsed.scheme(), host, port)))
181            }
182            "file" => Some(Self("file://".to_owned())),
183            _ => None,
184        }
185    }
186
187    /// Parses a stored origin string.
188    #[must_use]
189    pub fn from_stored(input: impl Into<String>) -> Self {
190        Self(input.into())
191    }
192
193    /// Returns the normalized origin string.
194    #[must_use]
195    pub fn as_str(&self) -> &str {
196        &self.0
197    }
198}
199
200impl Display for Origin {
201    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
202        f.write_str(self.as_str())
203    }
204}
205
#[cfg(test)]
mod tests {
    use super::{IndexUrl, Origin, Scheme, UrlError};

    /// Result type for tests that propagate parse failures with `?`.
    type TestResult = Result<(), Box<dyn std::error::Error>>;

    /// A plain HTTPS URL parses and reports its scheme.
    #[test]
    fn parses_https_url() -> TestResult {
        let url = IndexUrl::parse("https://example.com/docs")?;
        assert_eq!(url.scheme(), "https");
        Ok(())
    }

    /// Empty input is rejected with `Empty`.
    #[test]
    fn rejects_empty_url() {
        assert_eq!(IndexUrl::parse("").err(), Some(UrlError::Empty));
    }

    /// Interior whitespace is rejected.
    #[test]
    fn rejects_whitespace() {
        let outcome = IndexUrl::parse("https://example.com/a b");
        assert_eq!(outcome.err(), Some(UrlError::ContainsWhitespace));
    }

    /// Schemes outside the allowlist are rejected and echoed in the error.
    #[test]
    fn rejects_disallowed_scheme() {
        let expected = UrlError::DisallowedScheme("javascript".to_owned());
        assert_eq!(IndexUrl::parse("javascript:alert(1)").err(), Some(expected));
    }

    /// Input without a `:` separator has no scheme.
    #[test]
    fn rejects_missing_scheme() {
        let outcome = IndexUrl::parse("example.com/path");
        assert_eq!(outcome.err(), Some(UrlError::MissingScheme));
    }

    /// `file` URLs are accepted and render back unchanged.
    #[test]
    fn supports_file_scheme_and_display_roundtrip() -> TestResult {
        let source = "file:///tmp/example.txt";
        let url = IndexUrl::parse(source)?;
        assert_eq!(url.scheme(), "file");
        assert_eq!(url.to_string(), source);
        assert_eq!(url.as_str(), source);
        Ok(())
    }

    /// Known scheme names map to their variants; others become `Other`.
    #[test]
    fn scheme_parser_distinguishes_known_and_other_values() {
        let cases = [
            ("HTTP", Scheme::Http),
            ("https", Scheme::Https),
            ("file", Scheme::File),
            ("mailto", Scheme::Other("mailto".to_owned())),
        ];
        for (input, expected) in cases {
            assert_eq!(Scheme::parse(input), expected);
        }
    }

    /// Only the three first-class schemes are allowed initially.
    #[test]
    fn only_initial_allowlist_schemes_are_marked_allowed() {
        for allowed in [Scheme::Http, Scheme::Https, Scheme::File] {
            assert!(allowed.is_initially_allowed());
        }
        let ssh = Scheme::Other("ssh".to_owned());
        assert!(!ssh.is_initially_allowed());
    }

    /// Scheme and host case, the default port, and the fragment are normalized away.
    #[test]
    fn normalizes_scheme_host_default_port_and_fragment() -> TestResult {
        let url = IndexUrl::parse("HTTP://EXAMPLE.COM:80/docs#part")?;
        assert_eq!(url.to_string(), "http://example.com/docs");
        Ok(())
    }

    /// An explicit non-default port is kept in the origin.
    #[test]
    fn derives_http_origin() -> TestResult {
        let expected = Origin::from_stored("https://example.com:8443");
        let url = IndexUrl::parse("https://example.com:8443/docs")?;
        assert_eq!(url.origin(), Some(expected));
        Ok(())
    }

    /// Equivalent URLs that differ only by fragment share a filesystem-safe key.
    #[test]
    fn cache_keys_are_normalized_and_fragment_independent() -> TestResult {
        let first = IndexUrl::parse("https://EXAMPLE.com:443/docs?q=1#one")?;
        let second = IndexUrl::parse("https://example.com/docs?q=1#two")?;

        assert_eq!(first.as_str(), second.as_str());

        let first_key = first.cache_key();
        assert_eq!(first_key, second.cache_key());
        assert!(!first_key.contains('/'));
        assert!(!first_key.contains('?'));
        Ok(())
    }

    /// An HTTPS URL with an empty authority is rejected.
    #[test]
    fn rejects_http_urls_without_hosts() {
        let outcome = IndexUrl::parse("https:///docs");
        assert_eq!(outcome.err(), Some(UrlError::MissingHost));
    }

    /// Inputs the URL parser refuses surface as `Invalid`.
    #[test]
    fn rejects_parser_invalid_urls() {
        let outcome = IndexUrl::parse("http://[::1");
        assert!(matches!(outcome, Err(UrlError::Invalid(_))));
    }

    /// `file` URLs share the `file://` origin, which displays verbatim.
    #[test]
    fn derives_file_origin_and_displays_stored_origin() -> TestResult {
        let url = IndexUrl::parse("file:///tmp/index.html")?;
        let stored = Origin::from_stored("file://");

        assert_eq!(Origin::from_url(&url), Some(stored.clone()));
        assert_eq!(stored.to_string(), "file://");
        Ok(())
    }

    /// A URL whose only alphanumerics are the scheme yields the scheme as key.
    #[test]
    fn cache_key_has_fallback_for_non_alphanumeric_urls() -> TestResult {
        let key = IndexUrl::parse("file:///")?.cache_key();
        assert_eq!(key, "file");
        Ok(())
    }
}