Skip to main content

specter/url/
mod.rs

1//! RFC 3986 URL helper backed by [`http::Uri`].
2
3use std::fmt;
4use std::net::{Ipv4Addr, Ipv6Addr};
5use std::str::FromStr;
6
7use http::Uri;
8
/// Error produced when a URL cannot be parsed or reassembled.
///
/// The only payload is a human-readable description of what went wrong.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ParseError {
    /// Human-readable reason for the failure.
    message: String,
}
14
15impl ParseError {
16    fn new(message: impl Into<String>) -> Self {
17        Self {
18            message: message.into(),
19        }
20    }
21}
22
23impl fmt::Display for ParseError {
24    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
25        f.write_str(&self.message)
26    }
27}
28
29impl std::error::Error for ParseError {}
30
/// Host component of a URL authority, classified by syntax.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Host {
    /// A registered name (anything that is not an IP literal).
    Domain(String),
    /// A dotted-quad IPv4 address.
    Ipv4(Ipv4Addr),
    /// An IPv6 address (written in brackets inside an authority).
    Ipv6(Ipv6Addr),
}
38
39impl fmt::Display for Host {
40    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
41        match self {
42            Host::Domain(domain) => f.write_str(domain),
43            Host::Ipv4(addr) => write!(f, "{addr}"),
44            Host::Ipv6(addr) => write!(f, "{addr}"),
45        }
46    }
47}
48
49/// Absolute URL with normalized string storage.
50#[derive(Clone, PartialEq, Eq)]
51pub struct Url {
52    inner: String,
53    uri: Uri,
54    host_str: Option<String>,
55}
56
57impl fmt::Debug for Url {
58    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
59        f.debug_tuple("Url").field(&self.inner).finish()
60    }
61}
62
63impl fmt::Display for Url {
64    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
65        f.write_str(self.as_str())
66    }
67}
68
69impl Url {
70    pub fn parse(input: &str) -> Result<Self, ParseError> {
71        let without_fragment = input.split('#').next().unwrap_or(input);
72        if let Some(authority) = extract_authority(without_fragment) {
73            validate_authority(authority)?;
74        }
75        let uri = Uri::try_from(without_fragment)
76            .map_err(|err| ParseError::new(format!("invalid URI: {err}")))?;
77        if uri.scheme().is_none() {
78            return Err(ParseError::new("relative URL without a base"));
79        }
80        let host_str = uri
81            .authority()
82            .and_then(|authority| host_str_from_authority(authority.as_str()).ok());
83        Ok(Self {
84            inner: without_fragment.to_string(),
85            uri,
86            host_str,
87        })
88    }
89
90    #[inline]
91    pub fn as_str(&self) -> &str {
92        &self.inner
93    }
94
95    #[inline]
96    pub fn scheme(&self) -> &str {
97        self.uri.scheme_str().unwrap_or("")
98    }
99
100    #[inline]
101    pub fn path(&self) -> &str {
102        self.uri.path()
103    }
104
105    #[inline]
106    pub fn query(&self) -> Option<&str> {
107        self.uri.query()
108    }
109
110    #[inline]
111    pub fn port(&self) -> Option<u16> {
112        self.uri.port_u16()
113    }
114
115    #[inline]
116    pub fn port_or_known_default(&self) -> Option<u16> {
117        self.port().or_else(|| known_default_port(self.scheme()))
118    }
119
120    pub fn host(&self) -> Option<Host> {
121        let authority = self.uri.authority()?.as_str();
122        parse_host_port(authority).ok().map(|(host, _)| host)
123    }
124
125    #[inline]
126    pub fn host_str(&self) -> Option<&str> {
127        self.host_str.as_deref()
128    }
129
130    pub fn set_scheme(&mut self, scheme: &str) -> Result<(), ParseError> {
131        if scheme.is_empty() || !scheme.bytes().all(is_scheme_byte) {
132            return Err(ParseError::new("invalid URL scheme"));
133        }
134        let authority = self
135            .uri
136            .authority()
137            .map(|a| a.as_str())
138            .unwrap_or("")
139            .to_string();
140        let path_and_query = path_and_query_of(&self.uri);
141        *self = Self::assemble(scheme, &authority, &path_and_query)?;
142        Ok(())
143    }
144
145    pub fn set_port(&mut self, port: Option<u16>) -> Result<(), ParseError> {
146        let scheme = self.scheme();
147        let authority = format_authority_host(self.host(), port)?;
148        let path_and_query = path_and_query_of(&self.uri);
149        *self = Self::assemble(scheme, &authority, &path_and_query)?;
150        Ok(())
151    }
152
153    pub fn set_host(&mut self, host: Option<&str>) -> Result<(), ParseError> {
154        let host = host.ok_or_else(|| ParseError::new("missing URL host"))?;
155        validate_authority(host)?;
156        let scheme = self.scheme();
157        let host = parse_host_label(host)?;
158        let authority = format_authority_host(Some(host), self.port())?;
159        let path_and_query = path_and_query_of(&self.uri);
160        *self = Self::assemble(scheme, &authority, &path_and_query)?;
161        Ok(())
162    }
163
164    pub fn set_query(&mut self, query: Option<&str>) -> Result<(), ParseError> {
165        let scheme = self.scheme();
166        let authority = self
167            .uri
168            .authority()
169            .map(|a| a.as_str())
170            .unwrap_or("")
171            .to_string();
172        let mut path_and_query = self.path().to_string();
173        if let Some(q) = query {
174            path_and_query.push('?');
175            path_and_query.push_str(q);
176        }
177        *self = Self::assemble(scheme, &authority, &path_and_query)?;
178        Ok(())
179    }
180
181    /// RFC 3986 section 5.2 reference resolution against this base URL.
182    pub fn join(&self, reference: &str) -> Result<Self, ParseError> {
183        let reference = reference.split('#').next().unwrap_or(reference);
184        if scheme_end(reference).is_some() {
185            return Self::parse(reference);
186        }
187
188        let base_scheme = self.scheme();
189        let base_authority = self.uri.authority().map(|a| a.as_str()).unwrap_or("");
190        let base_path = self.path();
191        let base_query = self.query();
192
193        if let Some(rest) = reference.strip_prefix("//") {
194            let (authority, path, query) = split_authority_reference(rest)?;
195            let path = normalize_path(&path);
196            return Self::assemble_with_query(base_scheme, &authority, &path, query.as_deref());
197        }
198
199        if reference.starts_with('/') {
200            let (path, query) = split_path_query(reference);
201            let path = normalize_path(path);
202            return Self::assemble_with_query(base_scheme, base_authority, &path, query.as_deref());
203        }
204
205        if let Some(query) = reference.strip_prefix('?') {
206            return Self::assemble_with_query(base_scheme, base_authority, base_path, Some(query));
207        }
208
209        if reference.is_empty() {
210            return Self::assemble_with_query(base_scheme, base_authority, base_path, base_query);
211        }
212
213        let (ref_path, ref_query) = split_path_query(reference);
214        let merged_path = normalize_path(&merge_paths(base_path, ref_path));
215        Self::assemble_with_query(
216            base_scheme,
217            base_authority,
218            &merged_path,
219            ref_query.as_deref(),
220        )
221    }
222
223    fn assemble(scheme: &str, authority: &str, path_and_query: &str) -> Result<Self, ParseError> {
224        let (path, query) = split_path_query(path_and_query);
225        Self::assemble_with_query(scheme, authority, path, query.as_deref())
226    }
227
228    fn assemble_with_query(
229        scheme: &str,
230        authority: &str,
231        path: &str,
232        query: Option<&str>,
233    ) -> Result<Self, ParseError> {
234        let path = if path.is_empty() { "/" } else { path };
235        let mut inner = format!("{scheme}://");
236        if !authority.is_empty() {
237            inner.push_str(authority);
238        }
239        inner.push_str(path);
240        if let Some(query) = query {
241            inner.push('?');
242            inner.push_str(query);
243        }
244        Self::parse(&inner)
245    }
246}
247
248fn path_and_query_of(uri: &Uri) -> String {
249    match uri.path_and_query() {
250        Some(pq) => pq.as_str().to_string(),
251        None => "/".to_string(),
252    }
253}
254
/// Returns the well-known default port for `scheme`, if this module knows one.
///
/// Recognizes `http`/`ws` (80) and `https`/`wss` (443). Schemes are compared
/// ASCII case-insensitively because RFC 3986 section 3.1 defines them as
/// case-insensitive; any other scheme yields `None`.
fn known_default_port(scheme: &str) -> Option<u16> {
    const DEFAULT_PORTS: [(&str, u16); 4] =
        [("http", 80), ("ws", 80), ("https", 443), ("wss", 443)];

    DEFAULT_PORTS
        .iter()
        .find(|(name, _)| scheme.eq_ignore_ascii_case(name))
        .map(|&(_, port)| port)
}
262
/// Reports whether `byte` may appear in a URL scheme: an ASCII letter or
/// digit, or one of `+`, `-`, `.`.
fn is_scheme_byte(byte: u8) -> bool {
    matches!(
        byte,
        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'+' | b'-' | b'.'
    )
}
266
/// Locates the end of a leading URL scheme in `input`.
///
/// A scheme is an ASCII letter followed by any number of ASCII letters,
/// digits, `+`, `-` or `.`, terminated by `:`. Returns the byte offset of
/// that `:` (equivalently, the length of the scheme), or `None` when `input`
/// does not start with a scheme.
fn scheme_end(input: &str) -> Option<usize> {
    let mut chars = input.char_indices();

    let (_, first) = chars.next()?;
    if !first.is_ascii_alphabetic() {
        return None;
    }

    for (idx, ch) in chars {
        if ch == ':' {
            // Every preceding char was ASCII, so `idx` is also the scheme
            // length; this is correct for one-letter schemes too.
            return Some(idx);
        }
        // Compare as `char`: narrowing to `u8` would fold non-ASCII code
        // points (e.g. U+0163) onto valid ASCII scheme bytes.
        let is_scheme_char = ch.is_ascii_alphanumeric() || matches!(ch, '+' | '-' | '.');
        if !is_scheme_char {
            return None;
        }
    }

    None
}
287
/// Returns the text between the first `://` and the next `/` or `?` (or the
/// end of `input`), or `None` when there is no `://` or that text is empty.
fn extract_authority(input: &str) -> Option<&str> {
    let (_scheme, after_scheme) = input.split_once("://")?;
    let end = after_scheme
        .find(|c: char| c == '/' || c == '?')
        .unwrap_or(after_scheme.len());
    let authority = &after_scheme[..end];
    if authority.is_empty() {
        None
    } else {
        Some(authority)
    }
}
296
297fn validate_authority(authority: &str) -> Result<(), ParseError> {
298    if authority.contains('@') {
299        return Err(ParseError::new(
300            "userinfo in URL authority is not supported",
301        ));
302    }
303    if !authority.is_ascii() {
304        return Err(ParseError::new("non-ASCII host requires explicit punycode"));
305    }
306    Ok(())
307}
308
309fn host_str_from_authority(authority: &str) -> Result<String, ParseError> {
310    let (host, _) = parse_host_port(authority)?;
311    Ok(match host {
312        Host::Domain(domain) => domain,
313        Host::Ipv4(addr) => addr.to_string(),
314        Host::Ipv6(addr) => addr.to_string(),
315    })
316}
317
318fn parse_host_port(authority: &str) -> Result<(Host, Option<u16>), ParseError> {
319    if authority.is_empty() {
320        return Err(ParseError::new("missing URL host"));
321    }
322
323    if authority.starts_with('[') {
324        let end = authority
325            .find(']')
326            .ok_or_else(|| ParseError::new("invalid IPv6 authority"))?;
327        let ip = Ipv6Addr::from_str(&authority[1..end])
328            .map_err(|_| ParseError::new("invalid IPv6 address"))?;
329        let port = parse_port_suffix(&authority[end + 1..])?;
330        return Ok((Host::Ipv6(ip), port));
331    }
332
333    if let Some((host, port)) = authority.rsplit_once(':') {
334        if !host.is_empty() && port.chars().all(|c| c.is_ascii_digit()) {
335            let port = port
336                .parse::<u16>()
337                .map_err(|_| ParseError::new("invalid port"))?;
338            return Ok((parse_host_label(host)?, Some(port)));
339        }
340    }
341
342    Ok((parse_host_label(authority)?, None))
343}
344
345fn parse_host_label(host: &str) -> Result<Host, ParseError> {
346    if let Ok(ip) = Ipv4Addr::from_str(host) {
347        return Ok(Host::Ipv4(ip));
348    }
349    Ok(Host::Domain(host.to_ascii_lowercase()))
350}
351
352fn parse_port_suffix(suffix: &str) -> Result<Option<u16>, ParseError> {
353    if suffix.is_empty() {
354        return Ok(None);
355    }
356    if !suffix.starts_with(':') {
357        return Err(ParseError::new("invalid port suffix"));
358    }
359    suffix[1..]
360        .parse::<u16>()
361        .map(Some)
362        .map_err(|_| ParseError::new("invalid port"))
363}
364
365fn format_authority_host(host: Option<Host>, port: Option<u16>) -> Result<String, ParseError> {
366    let host = host.ok_or_else(|| ParseError::new("missing URL host"))?;
367    let mut authority = match host {
368        Host::Domain(domain) => domain,
369        Host::Ipv4(addr) => addr.to_string(),
370        Host::Ipv6(addr) => format!("[{addr}]"),
371    };
372    if let Some(port) = port {
373        authority.push(':');
374        authority.push_str(&port.to_string());
375    }
376    Ok(authority)
377}
378
/// Splits `input` at its first `?` into a borrowed path and an owned query.
///
/// The query is `None` when there is no `?`, and `Some("")` when the `?` is
/// the last character.
fn split_path_query(input: &str) -> (&str, Option<String>) {
    if let Some(mark) = input.find('?') {
        let query = input[mark + 1..].to_owned();
        (&input[..mark], Some(query))
    } else {
        (input, None)
    }
}
385
386fn split_authority_reference(input: &str) -> Result<(String, String, Option<String>), ParseError> {
387    let authority_end = input
388        .find('/')
389        .or_else(|| input.find('?'))
390        .unwrap_or(input.len());
391    let authority = &input[..authority_end];
392    let rest = &input[authority_end..];
393
394    if authority.is_empty() {
395        return Err(ParseError::new("missing authority in reference"));
396    }
397
398    let (path, query) = if rest.is_empty() {
399        ("/".to_string(), None)
400    } else if let Some(query) = rest.strip_prefix('?') {
401        ("/".to_string(), Some(query.to_string()))
402    } else {
403        let (path, query) = split_path_query(&rest[1..]);
404        let path = if path.is_empty() {
405            "/".to_string()
406        } else {
407            format!("/{path}")
408        };
409        (path, query)
410    };
411
412    Ok((authority.to_string(), path, query))
413}
414
/// Appends `reference_path` to `base_path` with the base's last segment
/// removed, i.e. everything up to and including its final `/` is kept.
///
/// A base without any `/` contributes nothing.
fn merge_paths(base_path: &str, reference_path: &str) -> String {
    let keep = base_path.rfind('/').map_or(0, |slash| slash + 1);
    let mut merged = String::with_capacity(keep + reference_path.len());
    merged.push_str(&base_path[..keep]);
    merged.push_str(reference_path);
    merged
}
423
424fn normalize_path(path: &str) -> String {
425    let (path_only, query) = split_path_query(path);
426    let normalized = remove_dot_segments(path_only);
427    match query {
428        Some(query) => format!("{normalized}?{query}"),
429        None => normalized,
430    }
431}
432
433fn remove_dot_segments(path: &str) -> String {
434    // RFC 3986 section 5.2.4 byte-walker algorithm; preserves trailing slashes
435    // that the segment-split form drops (e.g. `..` from `/a/b/c` -> `/a/`).
436    let mut input = path.to_string();
437    let mut output = String::new();
438
439    while !input.is_empty() {
440        if let Some(rest) = input.strip_prefix("../") {
441            input = rest.to_string();
442        } else if let Some(rest) = input.strip_prefix("./") {
443            input = rest.to_string();
444        } else if let Some(rest) = input.strip_prefix("/./") {
445            input = format!("/{rest}");
446        } else if input == "/." {
447            input = "/".to_string();
448        } else if let Some(rest) = input.strip_prefix("/../") {
449            input = format!("/{rest}");
450            pop_last_segment(&mut output);
451        } else if input == "/.." {
452            input = "/".to_string();
453            pop_last_segment(&mut output);
454        } else if input == "." || input == ".." {
455            input.clear();
456        } else {
457            let start = if input.starts_with('/') { 1 } else { 0 };
458            let end = match input[start..].find('/') {
459                Some(idx) => start + idx,
460                None => input.len(),
461            };
462            output.push_str(&input[..end]);
463            input = input[end..].to_string();
464        }
465    }
466
467    output
468}
469
/// RFC 3986 5.2.4 step C helper: drops the last segment of `output` together
/// with the `/` that precedes it. With no `/` present the buffer is emptied.
fn pop_last_segment(output: &mut String) {
    let cut = output.rfind('/').unwrap_or(0);
    output.truncate(cut);
}
482
#[cfg(test)]
mod unit_tests {
    use super::*;

    /// Parses `input`, expecting failure, and returns the rendered error.
    fn parse_failure(input: &str) -> String {
        Url::parse(input).unwrap_err().to_string()
    }

    #[test]
    fn rejects_non_ascii_authority() {
        let message = parse_failure("https://exämple.com/");
        assert!(message.contains("non-ASCII"));
    }

    #[test]
    fn rejects_userinfo() {
        let message = parse_failure("http://user:pass@host/");
        assert!(message.contains("userinfo"));
    }
}