// tirith_core/parse.rs

1use serde::{Deserialize, Serialize};
2use url::Url;
3
/// Represents different forms of URL-like patterns found in commands.
///
/// Serialized by serde as an internally tagged enum: the variant name is
/// written in `snake_case` under the `"type"` key (for example `docker_ref`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum UrlLike {
    /// Standard URL parsed by the `url` crate, with raw host preserved.
    Standard {
        /// The parsed URL; (de)serialized as a plain string via
        /// `serialize_url` / `deserialize_url`.
        #[serde(serialize_with = "serialize_url", deserialize_with = "deserialize_url")]
        parsed: Url,
        /// Host text as it appeared in the input, captured separately from
        /// `parsed` so normalization done by `Url::parse` does not hide it.
        raw_host: String,
    },
    /// SCP-style git reference (e.g., `git@github.com:user/repo.git`).
    Scp {
        /// Login name before the `@`, if one was present.
        user: Option<String>,
        /// Host name between the optional `user@` and the `:`.
        host: String,
        /// Everything after the `:`.
        path: String,
    },
    /// Docker image reference following distribution spec.
    DockerRef {
        /// Registry host (may carry a `:port` suffix); `None` when the
        /// reference names no registry, in which case `host()` reports
        /// `docker.io`.
        registry: Option<String>,
        /// Repository path, e.g. `library/nginx` or `org/image`.
        image: String,
        /// Tag following the final `:`, if any.
        tag: Option<String>,
        /// Digest following the final `@`, if any (e.g. `sha256:...`).
        digest: Option<String>,
    },
    /// URL that failed standard parsing but has extractable components.
    Unparsed {
        /// The original input, unchanged.
        raw: String,
        /// Best-effort host extracted from `raw`, if it had a `://` authority.
        raw_host: Option<String>,
        /// Best-effort path extracted from `raw`, if one could be located.
        raw_path: Option<String>,
    },
    /// Schemeless host+path found in sink contexts (curl, wget, etc.).
    ///
    /// `host` and `path` are stored separately; `raw_str()` rejoins them by
    /// plain concatenation.
    SchemelessHostPath { host: String, path: String },
}
36
37fn serialize_url<S>(url: &Url, serializer: S) -> Result<S::Ok, S::Error>
38where
39    S: serde::Serializer,
40{
41    serializer.serialize_str(url.as_str())
42}
43
44fn deserialize_url<'de, D>(deserializer: D) -> Result<Url, D::Error>
45where
46    D: serde::Deserializer<'de>,
47{
48    let s = String::deserialize(deserializer)?;
49    Url::parse(&s).map_err(serde::de::Error::custom)
50}
51
52impl UrlLike {
53    /// Returns the canonical host, if available.
54    pub fn host(&self) -> Option<&str> {
55        match self {
56            UrlLike::Standard { parsed, .. } => parsed.host_str(),
57            UrlLike::Scp { host, .. } => Some(host.as_str()),
58            UrlLike::DockerRef { registry, .. } => {
59                if let Some(reg) = registry {
60                    Some(reg.as_str())
61                } else {
62                    // Docker distribution spec default registry.
63                    Some("docker.io")
64                }
65            }
66            UrlLike::Unparsed { raw_host, .. } => raw_host.as_deref(),
67            UrlLike::SchemelessHostPath { host, .. } => Some(host.as_str()),
68        }
69    }
70
71    /// Returns the raw (pre-IDNA) host, if available.
72    pub fn raw_host(&self) -> Option<&str> {
73        match self {
74            UrlLike::Standard { raw_host, .. } => Some(raw_host.as_str()),
75            UrlLike::Scp { host, .. } => Some(host.as_str()),
76            UrlLike::DockerRef { registry, .. } => registry.as_deref().or(Some("docker.io")),
77            UrlLike::Unparsed { raw_host, .. } => raw_host.as_deref(),
78            UrlLike::SchemelessHostPath { host, .. } => Some(host.as_str()),
79        }
80    }
81
82    /// Returns the raw string representation.
83    pub fn raw_str(&self) -> String {
84        match self {
85            UrlLike::Standard { parsed, .. } => parsed.to_string(),
86            UrlLike::Scp { user, host, path } => {
87                if let Some(u) = user {
88                    format!("{u}@{host}:{path}")
89                } else {
90                    format!("{host}:{path}")
91                }
92            }
93            UrlLike::DockerRef {
94                registry,
95                image,
96                tag,
97                digest,
98            } => {
99                let mut s = String::new();
100                if let Some(reg) = registry {
101                    s.push_str(reg);
102                    s.push('/');
103                }
104                s.push_str(image);
105                if let Some(t) = tag {
106                    s.push(':');
107                    s.push_str(t);
108                }
109                if let Some(d) = digest {
110                    s.push('@');
111                    s.push_str(d);
112                }
113                s
114            }
115            UrlLike::Unparsed { raw, .. } => raw.clone(),
116            UrlLike::SchemelessHostPath { host, path } => {
117                format!("{host}{path}")
118            }
119        }
120    }
121
122    /// Returns the scheme if available.
123    pub fn scheme(&self) -> Option<&str> {
124        match self {
125            UrlLike::Standard { parsed, .. } => Some(parsed.scheme()),
126            _ => None,
127        }
128    }
129
130    /// Returns the path component if available.
131    pub fn path(&self) -> Option<&str> {
132        match self {
133            UrlLike::Standard { parsed, .. } => Some(parsed.path()),
134            UrlLike::Scp { path, .. } => Some(path.as_str()),
135            UrlLike::Unparsed { raw_path, .. } => raw_path.as_deref(),
136            UrlLike::SchemelessHostPath { path, .. } => Some(path.as_str()),
137            UrlLike::DockerRef { .. } => None,
138        }
139    }
140
141    /// Returns port if available.
142    pub fn port(&self) -> Option<u16> {
143        match self {
144            UrlLike::Standard { parsed, .. } => parsed.port(),
145            _ => None,
146        }
147    }
148
149    /// Returns userinfo if available.
150    pub fn userinfo(&self) -> Option<&str> {
151        match self {
152            UrlLike::Standard { parsed, .. } => {
153                let user = parsed.username();
154                if user.is_empty() {
155                    None
156                } else {
157                    Some(user)
158                }
159            }
160            UrlLike::Scp { user, .. } => user.as_deref(),
161            _ => None,
162        }
163    }
164}
165
166/// Extract the raw authority (host portion) from a URL string BEFORE IDNA
167/// normalization. Handles IPv6, userinfo, port, and percent-encoded separators.
168///
169/// Needed because `Url::parse` IDNA-normalizes the host, which would hide
170/// homograph/punycode signals that the hostname rules depend on.
171pub fn extract_raw_host(url_str: &str) -> Option<String> {
172    let after_scheme = if let Some(idx) = url_str.find("://") {
173        &url_str[idx + 3..]
174    } else {
175        return None;
176    };
177
178    let authority_end = after_scheme
179        .find(['/', '?', '#'])
180        .unwrap_or(after_scheme.len());
181    let authority = &after_scheme[..authority_end];
182
183    if authority.is_empty() {
184        return Some(String::new());
185    }
186
187    let host_part = split_userinfo(authority);
188    let host = extract_host_from_hostport(host_part);
189
190    Some(host.to_string())
191}
192
/// Split userinfo from authority, returning the host+port part.
///
/// Splits on the LAST literal `@`, so `user%40name@host` resolves to `host`.
/// A percent-encoded `%40` contains no `@` byte, so it can never be mistaken
/// for the separator and needs no special handling. Deliberately, nothing is
/// skipped after a `%`: skipping blindly would let a malformed escape such as
/// `user%@host` swallow the real separator and hide the true host.
///
/// Returns the whole input when it contains no `@`.
fn split_userinfo(authority: &str) -> &str {
    authority
        .rsplit_once('@')
        .map_or(authority, |(_userinfo, hostport)| hostport)
}
216
/// Extract host from a host:port string, handling IPv6 brackets.
///
/// * A bracketed IPv6 literal is returned with its brackets (`[::1]:80` →
///   `[::1]`); if the closing bracket is missing the input is returned as is.
/// * Otherwise the text after the LAST literal `:` is treated as a port, and
///   stripped together with the colon, only when it consists solely of ASCII
///   digits. RFC 3986 defines `port = *DIGIT`, so an empty port
///   (`example.com:`) is stripped as well.
/// * Anything else (`foo:bar`, no colon at all) is returned unchanged.
///
/// A percent-encoded colon (`%3A`) contains no `:` byte, so it is never taken
/// for the port separator. Nothing is skipped after a `%`, so a malformed
/// escape such as `host%:8080` cannot swallow the real separator.
fn extract_host_from_hostport(hostport: &str) -> &str {
    if hostport.starts_with('[') {
        return match hostport.find(']') {
            Some(close) => &hostport[..=close],
            None => hostport,
        };
    }

    match hostport.rsplit_once(':') {
        // Only an all-digit (possibly empty) suffix is a port; otherwise
        // `foo:bar` would be split incorrectly.
        Some((host, port)) if port.bytes().all(|b| b.is_ascii_digit()) => host,
        _ => hostport,
    }
}
254
255/// Parse a URL string into a UrlLike.
256pub fn parse_url(raw: &str) -> UrlLike {
257    // SCP-style refs (`git@host:path`) look nothing like standard URLs and must
258    // be matched before the generic `Url::parse`.
259    if let Some(scp) = try_parse_scp(raw) {
260        return scp;
261    }
262
263    if let Ok(parsed) = Url::parse(raw) {
264        let raw_host = extract_raw_host(raw).unwrap_or_default();
265        return UrlLike::Standard { parsed, raw_host };
266    }
267
268    let raw_host = extract_raw_host(raw);
269    let raw_path = extract_raw_path(raw);
270    UrlLike::Unparsed {
271        raw: raw.to_string(),
272        raw_host,
273        raw_path,
274    }
275}
276
277/// Try to parse as SCP-style reference: `[user@]host:path`.
278fn try_parse_scp(raw: &str) -> Option<UrlLike> {
279    if raw.contains("://") {
280        return None;
281    }
282
283    let (user_host, path) = raw.split_once(':')?;
284    if path.starts_with("//") {
285        // Looks like a scheme-relative URL, not SCP.
286        return None;
287    }
288
289    let (user, host) = if let Some((u, h)) = user_host.split_once('@') {
290        (Some(u.to_string()), h)
291    } else {
292        (None, user_host)
293    };
294
295    // Require a dot or `localhost` so we don't mistake `foo:bar` shell syntax
296    // (e.g. `make:build`) for an SCP ref.
297    if !host.contains('.') && host != "localhost" {
298        return None;
299    }
300
301    Some(UrlLike::Scp {
302        user,
303        host: host.to_string(),
304        path: path.to_string(),
305    })
306}
307
308/// Parse a Docker image reference following distribution spec.
309pub fn parse_docker_ref(raw: &str) -> UrlLike {
310    let mut remaining = raw;
311    let mut digest = None;
312    let mut tag = None;
313
314    if let Some(at_idx) = remaining.rfind('@') {
315        digest = Some(remaining[at_idx + 1..].to_string());
316        remaining = &remaining[..at_idx];
317    }
318
319    if let Some(colon_idx) = remaining.rfind(':') {
320        let potential_tag = &remaining[colon_idx + 1..];
321        let before_colon = &remaining[..colon_idx];
322        // `/` in the suffix would mean this colon is a registry:port separator,
323        // not a tag.
324        if !potential_tag.contains('/') {
325            tag = Some(potential_tag.to_string());
326            remaining = before_colon;
327        }
328    }
329
330    let parts: Vec<&str> = remaining.split('/').collect();
331
332    let (registry, image) = if parts.len() == 1 {
333        // `nginx` → `docker.io/library/nginx` (implicit `library/` namespace).
334        (None, format!("library/{}", parts[0]))
335    } else {
336        let first = parts[0];
337        // A host-like first segment (has `.`/`:` or is `localhost`) is a registry;
338        // otherwise every segment is part of the image name.
339        let is_registry = first.contains('.') || first.contains(':') || first == "localhost";
340
341        if is_registry {
342            let image_parts = &parts[1..];
343            (Some(first.to_string()), image_parts.join("/"))
344        } else {
345            (None, parts.join("/"))
346        }
347    };
348
349    UrlLike::DockerRef {
350        registry,
351        image,
352        tag,
353        digest,
354    }
355}
356
/// Extract raw path from a URL string (fallback for unparseable URLs).
///
/// The authority ends at the first `/`, `?` or `#` after `://` — the same
/// boundary `extract_raw_host` uses. A path exists only when that boundary is
/// a `/`; a `/` that first appears inside a query or fragment
/// (`x://host?next=/admin`) is not a path and yields `None`.
///
/// The returned string runs from that `/` to the end of the input, so any
/// query or fragment that follows the path is included. Returns `None` when
/// the input has no `://` or no path.
fn extract_raw_path(raw: &str) -> Option<String> {
    let (_, after_scheme) = raw.split_once("://")?;
    let boundary = after_scheme.find(['/', '?', '#'])?;
    let tail = &after_scheme[boundary..];
    tail.starts_with('/').then(|| tail.to_string())
}
367
/// Unit tests for URL, SCP and Docker reference parsing.
///
/// Every test that destructures a variant panics in its `else` branch so a
/// wrong variant fails the test instead of silently skipping the assertions.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_standard_url() {
        let u = parse_url("https://example.com/path?q=1");
        assert!(matches!(u, UrlLike::Standard { .. }));
        assert_eq!(u.host(), Some("example.com"));
        assert_eq!(u.scheme(), Some("https"));
        assert_eq!(u.path(), Some("/path"));
    }

    #[test]
    fn test_raw_host_preserved() {
        let u = parse_url("https://example.com/path");
        if let UrlLike::Standard { raw_host, .. } = &u {
            assert_eq!(raw_host, "example.com");
        } else {
            panic!("expected Standard");
        }
    }

    #[test]
    fn test_raw_host_ipv6() {
        let raw = "http://[::1]:8080/path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("[::1]".to_string()));
    }

    #[test]
    fn test_raw_host_userinfo() {
        let raw = "http://user@example.com/path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("example.com".to_string()));
    }

    #[test]
    fn test_raw_host_encoded_at() {
        // `%40` is an encoded `@` inside the userinfo, not the separator.
        let raw = "http://user%40name@host.com/path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("host.com".to_string()));
    }

    #[test]
    fn test_raw_host_encoded_colon() {
        // `%3A` is an encoded `:` inside the host, not a port separator.
        let raw = "http://exam%3Aple.com/path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("exam%3Aple.com".to_string()));
    }

    #[test]
    fn test_raw_host_empty() {
        let raw = "http:///path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("".to_string()));
    }

    #[test]
    fn test_raw_host_trailing_dot() {
        let raw = "http://example.com./path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("example.com.".to_string()));
    }

    #[test]
    fn test_raw_host_with_port() {
        let raw = "http://example.com:8080/path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("example.com".to_string()));
    }

    #[test]
    fn test_parse_scp() {
        let u = parse_url("git@github.com:user/repo.git");
        assert!(matches!(u, UrlLike::Scp { .. }));
        assert_eq!(u.host(), Some("github.com"));
        assert_eq!(u.path(), Some("user/repo.git"));
    }

    #[test]
    fn test_docker_ref_single_component() {
        let u = parse_docker_ref("nginx");
        if let UrlLike::DockerRef {
            registry, image, ..
        } = &u
        {
            assert!(registry.is_none());
            assert_eq!(image, "library/nginx");
        } else {
            panic!("expected DockerRef");
        }
        assert_eq!(u.host(), Some("docker.io"));
    }

    #[test]
    fn test_docker_ref_user_image() {
        let u = parse_docker_ref("user/image");
        if let UrlLike::DockerRef {
            registry, image, ..
        } = &u
        {
            assert!(registry.is_none());
            assert_eq!(image, "user/image");
        } else {
            panic!("expected DockerRef");
        }
    }

    #[test]
    fn test_docker_ref_with_registry() {
        let u = parse_docker_ref("myregistry.com/image");
        if let UrlLike::DockerRef {
            registry, image, ..
        } = &u
        {
            assert_eq!(registry.as_deref(), Some("myregistry.com"));
            assert_eq!(image, "image");
        } else {
            panic!("expected DockerRef");
        }
    }

    #[test]
    fn test_docker_ref_localhost_port() {
        let u = parse_docker_ref("localhost:5000/image");
        if let UrlLike::DockerRef {
            registry, image, ..
        } = &u
        {
            assert_eq!(registry.as_deref(), Some("localhost:5000"));
            assert_eq!(image, "image");
        } else {
            panic!("expected DockerRef");
        }
    }

    #[test]
    fn test_docker_ref_with_digest() {
        let u = parse_docker_ref("registry:5000/org/image:v1@sha256:abc123");
        if let UrlLike::DockerRef {
            registry,
            image,
            tag,
            digest,
        } = &u
        {
            assert_eq!(registry.as_deref(), Some("registry:5000"));
            assert_eq!(image, "org/image");
            assert_eq!(tag.as_deref(), Some("v1"));
            assert_eq!(digest.as_deref(), Some("sha256:abc123"));
        } else {
            panic!("expected DockerRef");
        }
    }

    #[test]
    fn test_docker_ref_gcr() {
        let u = parse_docker_ref("gcr.io/project/image");
        if let UrlLike::DockerRef {
            registry, image, ..
        } = &u
        {
            assert_eq!(registry.as_deref(), Some("gcr.io"));
            assert_eq!(image, "project/image");
        } else {
            panic!("expected DockerRef");
        }
    }

    #[test]
    fn test_docker_dotted_tag() {
        let u = parse_docker_ref("nginx:1.25");
        if let UrlLike::DockerRef { image, tag, .. } = &u {
            assert_eq!(image, "library/nginx");
            assert_eq!(tag.as_deref(), Some("1.25"));
        } else {
            panic!("expected DockerRef");
        }
    }

    #[test]
    fn test_docker_registry_port_no_tag() {
        let u = parse_docker_ref("registry.io:5000/nginx");
        if let UrlLike::DockerRef {
            registry,
            image,
            tag,
            ..
        } = &u
        {
            assert_eq!(registry.as_deref(), Some("registry.io:5000"));
            assert_eq!(image, "nginx");
            assert!(tag.is_none());
        } else {
            panic!("expected DockerRef");
        }
    }

    #[test]
    fn test_docker_registry_port_with_dotted_tag() {
        let u = parse_docker_ref("registry.io:5000/nginx:1.25");
        if let UrlLike::DockerRef {
            registry,
            image,
            tag,
            ..
        } = &u
        {
            assert_eq!(registry.as_deref(), Some("registry.io:5000"));
            assert_eq!(image, "nginx");
            assert_eq!(tag.as_deref(), Some("1.25"));
        } else {
            panic!("expected DockerRef");
        }
    }

    #[test]
    fn test_unparsed_fallback() {
        // An unterminated IPv6 bracket makes `Url::parse` fail.
        let u = parse_url("not://[invalid");
        assert!(matches!(u, UrlLike::Unparsed { .. }));
    }
}