Skip to main content

tirith_core/
parse.rs

1use serde::{Deserialize, Serialize};
2use url::Url;
3
/// Represents different forms of URL-like patterns found in commands.
///
/// Serialized with serde as an internally tagged value: the variant name,
/// converted to snake_case, is stored under the `type` key alongside the
/// variant's fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum UrlLike {
    /// Standard URL parsed by the `url` crate, with raw host preserved.
    Standard {
        /// The parsed URL; written and read as a plain string via
        /// `serialize_url` / `deserialize_url`.
        #[serde(serialize_with = "serialize_url", deserialize_with = "deserialize_url")]
        parsed: Url,
        /// Host text taken from the original input string rather than from
        /// `parsed`. `parse_url` stores an empty string when no host could be
        /// extracted.
        raw_host: String,
    },
    /// SCP-style git reference (e.g., `git@github.com:user/repo.git`).
    Scp {
        /// Text before the `@`, when the reference has one.
        user: Option<String>,
        /// Host name between the optional `user@` and the `:`.
        host: String,
        /// Everything after the `:` separator, stored verbatim.
        path: String,
    },
    /// Docker image reference following distribution spec.
    DockerRef {
        /// Explicit registry (may carry a `:port`, e.g. `localhost:5000`).
        /// `None` means no registry was written; the host accessors then
        /// report `docker.io`.
        registry: Option<String>,
        /// Repository path; `parse_docker_ref` prefixes single-component
        /// names with `library/`.
        image: String,
        /// Tag following the last `:` of the name, if any.
        tag: Option<String>,
        /// Text following `@` (e.g. `sha256:...`), if any.
        digest: Option<String>,
    },
    /// URL that failed standard parsing but has extractable components.
    Unparsed {
        /// The input string exactly as received.
        raw: String,
        /// Best-effort host extracted from `raw`, if one could be found.
        raw_host: Option<String>,
        /// Best-effort path extracted from `raw`, if one could be found.
        raw_path: Option<String>,
    },
    /// Schemeless host+path found in sink contexts (curl, wget, etc.).
    SchemelessHostPath { host: String, path: String },
}
36
37fn serialize_url<S>(url: &Url, serializer: S) -> Result<S::Ok, S::Error>
38where
39    S: serde::Serializer,
40{
41    serializer.serialize_str(url.as_str())
42}
43
44fn deserialize_url<'de, D>(deserializer: D) -> Result<Url, D::Error>
45where
46    D: serde::Deserializer<'de>,
47{
48    let s = String::deserialize(deserializer)?;
49    Url::parse(&s).map_err(serde::de::Error::custom)
50}
51
52impl UrlLike {
53    /// Returns the canonical host, if available.
54    pub fn host(&self) -> Option<&str> {
55        match self {
56            UrlLike::Standard { parsed, .. } => parsed.host_str(),
57            UrlLike::Scp { host, .. } => Some(host.as_str()),
58            UrlLike::DockerRef { registry, .. } => {
59                if let Some(reg) = registry {
60                    Some(reg.as_str())
61                } else {
62                    // Resolved default registry
63                    Some("docker.io")
64                }
65            }
66            UrlLike::Unparsed { raw_host, .. } => raw_host.as_deref(),
67            UrlLike::SchemelessHostPath { host, .. } => Some(host.as_str()),
68        }
69    }
70
71    /// Returns the raw (pre-IDNA) host, if available.
72    pub fn raw_host(&self) -> Option<&str> {
73        match self {
74            UrlLike::Standard { raw_host, .. } => Some(raw_host.as_str()),
75            UrlLike::Scp { host, .. } => Some(host.as_str()),
76            UrlLike::DockerRef { registry, .. } => registry.as_deref().or(Some("docker.io")),
77            UrlLike::Unparsed { raw_host, .. } => raw_host.as_deref(),
78            UrlLike::SchemelessHostPath { host, .. } => Some(host.as_str()),
79        }
80    }
81
82    /// Returns the raw string representation.
83    pub fn raw_str(&self) -> String {
84        match self {
85            UrlLike::Standard { parsed, .. } => parsed.to_string(),
86            UrlLike::Scp { user, host, path } => {
87                if let Some(u) = user {
88                    format!("{u}@{host}:{path}")
89                } else {
90                    format!("{host}:{path}")
91                }
92            }
93            UrlLike::DockerRef {
94                registry,
95                image,
96                tag,
97                digest,
98            } => {
99                let mut s = String::new();
100                if let Some(reg) = registry {
101                    s.push_str(reg);
102                    s.push('/');
103                }
104                s.push_str(image);
105                if let Some(t) = tag {
106                    s.push(':');
107                    s.push_str(t);
108                }
109                if let Some(d) = digest {
110                    s.push('@');
111                    s.push_str(d);
112                }
113                s
114            }
115            UrlLike::Unparsed { raw, .. } => raw.clone(),
116            UrlLike::SchemelessHostPath { host, path } => {
117                format!("{host}{path}")
118            }
119        }
120    }
121
122    /// Returns the scheme if available.
123    pub fn scheme(&self) -> Option<&str> {
124        match self {
125            UrlLike::Standard { parsed, .. } => Some(parsed.scheme()),
126            _ => None,
127        }
128    }
129
130    /// Returns the path component if available.
131    pub fn path(&self) -> Option<&str> {
132        match self {
133            UrlLike::Standard { parsed, .. } => Some(parsed.path()),
134            UrlLike::Scp { path, .. } => Some(path.as_str()),
135            UrlLike::Unparsed { raw_path, .. } => raw_path.as_deref(),
136            UrlLike::SchemelessHostPath { path, .. } => Some(path.as_str()),
137            UrlLike::DockerRef { .. } => None,
138        }
139    }
140
141    /// Returns port if available.
142    pub fn port(&self) -> Option<u16> {
143        match self {
144            UrlLike::Standard { parsed, .. } => parsed.port(),
145            _ => None,
146        }
147    }
148
149    /// Returns userinfo if available.
150    pub fn userinfo(&self) -> Option<&str> {
151        match self {
152            UrlLike::Standard { parsed, .. } => {
153                let user = parsed.username();
154                if user.is_empty() {
155                    None
156                } else {
157                    Some(user)
158                }
159            }
160            UrlLike::Scp { user, .. } => user.as_deref(),
161            _ => None,
162        }
163    }
164}
165
166/// Extract raw authority (host portion) from a URL string before IDNA normalization.
167/// Handles IPv6, userinfo, port, and percent-encoded separators.
168pub fn extract_raw_host(url_str: &str) -> Option<String> {
169    // Find the authority section: after "scheme://"
170    let after_scheme = if let Some(idx) = url_str.find("://") {
171        &url_str[idx + 3..]
172    } else {
173        return None;
174    };
175
176    // Find end of authority (first `/`, `?`, `#`, or end of string)
177    let authority_end = after_scheme
178        .find(['/', '?', '#'])
179        .unwrap_or(after_scheme.len());
180    let authority = &after_scheme[..authority_end];
181
182    if authority.is_empty() {
183        return Some(String::new());
184    }
185
186    // Split off userinfo: find LAST unencoded `@`
187    let host_part = split_userinfo(authority);
188
189    // Extract host from host_part (handle IPv6, port)
190    let host = extract_host_from_hostport(host_part);
191
192    Some(host.to_string())
193}
194
/// Split userinfo from authority, returning the host+port part.
///
/// The separator is the last literal `@` in `authority`. A percent-encoded
/// `%40` is NOT a separator, and needs no special handling: the escape
/// contains no literal `@`, so a plain search for the last `@` byte can never
/// land on one. Scanning this way also stays correct for malformed escapes
/// such as `a%@b.com`, where the `@` directly follows a stray `%`.
///
/// Returns `authority` unchanged when it contains no `@`.
fn split_userinfo(authority: &str) -> &str {
    match authority.rfind('@') {
        // `@` is a single ASCII byte, so `at + 1` is a valid char boundary.
        Some(at) => &authority[at + 1..],
        None => authority,
    }
}
217
/// Extract host from a host:port string, handling IPv6 brackets.
///
/// * Input starting with `[` is treated as an IPv6 literal: everything up to
///   and including the first `]` is returned (the whole input if there is no
///   closing bracket).
/// * Otherwise the text after the last literal `:` is dropped when it is a
///   non-empty run of ASCII digits, i.e. looks like a port. Anything else
///   (no colon, empty port, non-numeric suffix) returns the input unchanged.
///
/// A percent-encoded colon (`%3A`) contains no literal `:`, so searching for
/// the last `:` byte never mistakes it for the port separator. This also
/// keeps a real `:` visible when it directly follows a stray `%`, as in
/// `host%:80`.
fn extract_host_from_hostport(hostport: &str) -> &str {
    if hostport.starts_with('[') {
        return match hostport.find(']') {
            Some(close) => &hostport[..=close],
            None => hostport,
        };
    }

    match hostport.rsplit_once(':') {
        Some((host, port)) if !port.is_empty() && port.bytes().all(|b| b.is_ascii_digit()) => {
            host
        }
        _ => hostport,
    }
}
256
257/// Parse a URL string into a UrlLike.
258pub fn parse_url(raw: &str) -> UrlLike {
259    // Try SCP-style git reference: user@host:path (no scheme)
260    if let Some(scp) = try_parse_scp(raw) {
261        return scp;
262    }
263
264    // Try standard URL parsing
265    if let Ok(parsed) = Url::parse(raw) {
266        let raw_host = extract_raw_host(raw).unwrap_or_default();
267        return UrlLike::Standard { parsed, raw_host };
268    }
269
270    // Fallback: try to extract raw components
271    let raw_host = extract_raw_host(raw);
272    let raw_path = extract_raw_path(raw);
273    UrlLike::Unparsed {
274        raw: raw.to_string(),
275        raw_host,
276        raw_path,
277    }
278}
279
280/// Try to parse as SCP-style reference: [user@]host:path
281fn try_parse_scp(raw: &str) -> Option<UrlLike> {
282    // Must not have a scheme
283    if raw.contains("://") {
284        return None;
285    }
286
287    // Pattern: [user@]host:path where path doesn't start with //
288    let (user_host, path) = raw.split_once(':')?;
289    if path.starts_with("//") {
290        return None; // Looks like a scheme-relative URL
291    }
292
293    // Must have a host that looks like a domain
294    let (user, host) = if let Some((u, h)) = user_host.split_once('@') {
295        (Some(u.to_string()), h)
296    } else {
297        (None, user_host)
298    };
299
300    // Host must contain a dot or be a known hostname pattern
301    if !host.contains('.') && host != "localhost" {
302        return None;
303    }
304
305    Some(UrlLike::Scp {
306        user,
307        host: host.to_string(),
308        path: path.to_string(),
309    })
310}
311
312/// Parse a Docker image reference following distribution spec.
313pub fn parse_docker_ref(raw: &str) -> UrlLike {
314    let mut remaining = raw;
315    let mut digest = None;
316    let mut tag = None;
317
318    // Extract digest (@sha256:...)
319    if let Some(at_idx) = remaining.rfind('@') {
320        digest = Some(remaining[at_idx + 1..].to_string());
321        remaining = &remaining[..at_idx];
322    }
323
324    // Extract tag (:tag)
325    if let Some(colon_idx) = remaining.rfind(':') {
326        let potential_tag = &remaining[colon_idx + 1..];
327        // Tag must not contain '/' (that would be registry:port)
328        let before_colon = &remaining[..colon_idx];
329        // If the part after colon contains no '/' and the part before contains no ':',
330        // or if this is clearly a tag (no dots in tag portion)
331        if !potential_tag.contains('/') {
332            tag = Some(potential_tag.to_string());
333            remaining = before_colon;
334        }
335    }
336
337    // Split into components
338    let parts: Vec<&str> = remaining.split('/').collect();
339
340    let (registry, image) = if parts.len() == 1 {
341        // Single component: nginx -> docker.io/library/nginx
342        (None, format!("library/{}", parts[0]))
343    } else {
344        // Check if first component is a registry
345        let first = parts[0];
346        let is_registry = first.contains('.') || first.contains(':') || first == "localhost";
347
348        if is_registry {
349            let image_parts = &parts[1..];
350            (Some(first.to_string()), image_parts.join("/"))
351        } else {
352            // All parts form the image name, default registry
353            (None, parts.join("/"))
354        }
355    };
356
357    UrlLike::DockerRef {
358        registry,
359        image,
360        tag,
361        digest,
362    }
363}
364
/// Extract raw path from a URL string (fallback for unparseable URLs).
///
/// Looks at the text after the first `://` and finds where the authority
/// ends, i.e. the first `/`, `?` or `#`. A path exists only when that
/// delimiter is `/`; a `/` that first appears inside a query or fragment
/// (as in `x://host?next=/evil`) is not a path and yields `None`.
///
/// The returned string starts at that `/` and runs to the end of the input,
/// so it still includes any query or fragment that follows the path.
///
/// Returns `None` when there is no `://` or no path after the authority.
fn extract_raw_path(raw: &str) -> Option<String> {
    let (_, after_scheme) = raw.split_once("://")?;
    let authority_end = after_scheme.find(['/', '?', '#'])?;
    let tail = &after_scheme[authority_end..];
    if tail.starts_with('/') {
        Some(tail.to_string())
    } else {
        None
    }
}
375
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `raw` with `parse_docker_ref` and returns
    /// `(registry, image, tag, digest)`.
    ///
    /// Panics on any other variant, so a Docker test can never pass without
    /// its assertions having run.
    fn docker_parts(raw: &str) -> (Option<String>, String, Option<String>, Option<String>) {
        match parse_docker_ref(raw) {
            UrlLike::DockerRef {
                registry,
                image,
                tag,
                digest,
            } => (registry, image, tag, digest),
            other => panic!("expected DockerRef, got {other:?}"),
        }
    }

    #[test]
    fn test_parse_standard_url() {
        let u = parse_url("https://example.com/path?q=1");
        assert!(matches!(u, UrlLike::Standard { .. }));
        assert_eq!(u.host(), Some("example.com"));
        assert_eq!(u.scheme(), Some("https"));
        assert_eq!(u.path(), Some("/path"));
    }

    #[test]
    fn test_raw_host_preserved() {
        match parse_url("https://example.com/path") {
            UrlLike::Standard { raw_host, .. } => assert_eq!(raw_host, "example.com"),
            other => panic!("expected Standard, got {other:?}"),
        }
    }

    #[test]
    fn test_raw_host_ipv6() {
        let raw = "http://[::1]:8080/path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("[::1]".to_string()));
    }

    #[test]
    fn test_raw_host_userinfo() {
        let raw = "http://user@example.com/path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("example.com".to_string()));
    }

    #[test]
    fn test_raw_host_encoded_at() {
        let raw = "http://user%40name@host.com/path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("host.com".to_string()));
    }

    #[test]
    fn test_raw_host_encoded_colon() {
        let raw = "http://exam%3Aple.com/path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("exam%3Aple.com".to_string()));
    }

    #[test]
    fn test_raw_host_empty() {
        let raw = "http:///path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("".to_string()));
    }

    #[test]
    fn test_raw_host_trailing_dot() {
        let raw = "http://example.com./path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("example.com.".to_string()));
    }

    #[test]
    fn test_raw_host_with_port() {
        let raw = "http://example.com:8080/path";
        let host = extract_raw_host(raw);
        assert_eq!(host, Some("example.com".to_string()));
    }

    #[test]
    fn test_parse_scp() {
        let u = parse_url("git@github.com:user/repo.git");
        assert!(matches!(u, UrlLike::Scp { .. }));
        assert_eq!(u.host(), Some("github.com"));
        assert_eq!(u.path(), Some("user/repo.git"));
    }

    #[test]
    fn test_docker_ref_single_component() {
        let (registry, image, _, _) = docker_parts("nginx");
        assert!(registry.is_none());
        assert_eq!(image, "library/nginx");
        // With no explicit registry the host resolves to the default one.
        assert_eq!(parse_docker_ref("nginx").host(), Some("docker.io"));
    }

    #[test]
    fn test_docker_ref_user_image() {
        let (registry, image, _, _) = docker_parts("user/image");
        assert!(registry.is_none());
        assert_eq!(image, "user/image");
    }

    #[test]
    fn test_docker_ref_with_registry() {
        let (registry, image, _, _) = docker_parts("myregistry.com/image");
        assert_eq!(registry.as_deref(), Some("myregistry.com"));
        assert_eq!(image, "image");
    }

    #[test]
    fn test_docker_ref_localhost_port() {
        let (registry, image, _, _) = docker_parts("localhost:5000/image");
        assert_eq!(registry.as_deref(), Some("localhost:5000"));
        assert_eq!(image, "image");
    }

    #[test]
    fn test_docker_ref_with_digest() {
        let (registry, image, tag, digest) =
            docker_parts("registry:5000/org/image:v1@sha256:abc123");
        assert_eq!(registry.as_deref(), Some("registry:5000"));
        assert_eq!(image, "org/image");
        assert_eq!(tag.as_deref(), Some("v1"));
        assert_eq!(digest.as_deref(), Some("sha256:abc123"));
    }

    #[test]
    fn test_docker_ref_gcr() {
        let (registry, image, _, _) = docker_parts("gcr.io/project/image");
        assert_eq!(registry.as_deref(), Some("gcr.io"));
        assert_eq!(image, "project/image");
    }

    #[test]
    fn test_docker_dotted_tag() {
        let (_, image, tag, _) = docker_parts("nginx:1.25");
        assert_eq!(image, "library/nginx");
        assert_eq!(tag.as_deref(), Some("1.25"));
    }

    #[test]
    fn test_docker_registry_port_no_tag() {
        let (registry, image, tag, _) = docker_parts("registry.io:5000/nginx");
        assert_eq!(registry.as_deref(), Some("registry.io:5000"));
        assert_eq!(image, "nginx");
        assert!(tag.is_none());
    }

    #[test]
    fn test_docker_registry_port_with_dotted_tag() {
        let (registry, image, tag, _) = docker_parts("registry.io:5000/nginx:1.25");
        assert_eq!(registry.as_deref(), Some("registry.io:5000"));
        assert_eq!(image, "nginx");
        assert_eq!(tag.as_deref(), Some("1.25"));
    }

    #[test]
    fn test_unparsed_fallback() {
        let u = parse_url("not://[invalid");
        assert!(matches!(u, UrlLike::Unparsed { .. }));
    }
}