Skip to main content

git_remote_object_store/
url.rs

1//! Parser for the `s3+https` / `s3+http` / `az+https` / `az+http` URL
2//! grammar.
3//!
4//! The parser strips the backend prefix (`s3+` or `az+`), parses the
5//! remainder as an RFC 3986 URL via the [`url`] crate, then layers
6//! cleartext-HTTP gating, backend-specific name validation,
7//! addressing-style detection, and query-flag extraction on top. The
8//! user-facing grammar reference is `docs/getting-started.md`.
9
10use std::env;
11use std::fmt;
12use std::num::NonZeroU64;
13use std::str::FromStr;
14
15use thiserror::Error;
16use url::Url;
17
/// Name of the environment variable that lifts the cleartext-HTTP gate,
/// allowing `*+http://` URLs against non-loopback hosts.
///
/// The override is accepted when the variable is set to any of the
/// truthy values recognised by [`parse_bool_value`] (`1`, `true`, `yes`,
/// `on`, case-insensitive). Any other value — including the empty
/// string, `0`, `false`, `no`, `off`, or any unrecognised token — is
/// treated as "not set" and the cleartext-HTTP gate stays closed.
pub const ENV_ALLOW_HTTP: &str = "GIT_REMOTE_OBJECT_STORE_ALLOW_HTTP";
25
/// Maximum accepted value for `?bundle_uri_presign_ttl=<seconds>`: 7
/// days, expressed in seconds (`7 * 24 * 60 * 60 = 604800`). Pinned at
/// the URL boundary so the value cannot reach the backend SDKs as a
/// degenerate input; values above it are rejected with
/// [`ParseError::BundleUriPresignTtlTooLarge`].
///
/// AWS enforces a 7-day ceiling on presigned URLs as part of the
/// `SigV4` specification; passing anything larger to
/// `aws_sdk_s3::presigning::PresigningConfig::expires_in` fails with
/// `expires_in must be less than or equal to 604800 seconds`. Azure
/// service-SAS does not have a comparable spec-mandated cap, but a
/// pathological caller-supplied TTL (e.g. `u64::MAX`) caused a panic
/// in [`crate::object_store::azure::sas::build_blob_sas_url`] via
/// `time::Duration::seconds_f64` overflow. Applying the same 7-day
/// cap to both backends gives consistent behaviour and a clean error
/// at URL-parse time rather than mid-protocol (issue #219).
pub(crate) const MAX_BUNDLE_URI_PRESIGN_TTL_SECONDS: u64 = 7 * 24 * 60 * 60;
41
/// A parsed remote URL, tagged by backend.
///
/// The `endpoint` field holds the canonical `https://` or `http://`
/// URL that remains after stripping the backend prefix; bucket /
/// account / container / prefix are projections of that URL.
///
/// Values are produced by [`parse`] (also reachable through the
/// [`FromStr`] impl). The [`fmt::Display`] impl writes the backend
/// prefix (`s3+` / `az+`) followed by `endpoint`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RemoteUrl {
    /// Amazon S3 (or any S3-compatible) endpoint.
    S3 {
        /// Canonical RFC 3986 endpoint URL (the input minus `s3+`).
        endpoint: Url,
        /// Bucket name.
        bucket: String,
        /// Optional repository prefix within the bucket (no trailing `/`).
        /// `None` when no path segments remain after the bucket.
        prefix: Option<String>,
        /// Auto-detected or explicitly overridden addressing style.
        addressing: S3Addressing,
        /// Query-string flags.
        flags: RemoteFlags,
    },
    /// Azure Blob Storage endpoint.
    Azure {
        /// Canonical RFC 3986 endpoint URL (the input minus `az+`).
        endpoint: Url,
        /// Storage-account name.
        account: String,
        /// Container name.
        container: String,
        /// Optional repository prefix within the container (no trailing `/`).
        prefix: Option<String>,
        /// Auto-detected or explicitly overridden addressing style.
        addressing: AzureAddressing,
        /// Query-string flags.
        flags: RemoteFlags,
    },
}
78
/// S3 addressing style (§3.4): where the bucket name lives in the URL.
///
/// Chosen by auto-detection from the hostname unless the
/// `?addressing=path|virtual` query flag overrides it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum S3Addressing {
    /// `<bucket>.s3.<region>.amazonaws.com` — bucket is the leftmost
    /// hostname label.
    VirtualHosted,
    /// `s3.<region>.amazonaws.com/<bucket>` — bucket is the first path
    /// segment.
    PathStyle,
}
89
/// Azure Blob addressing style (§3.4): where the storage-account name
/// lives in the URL.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AzureAddressing {
    /// `<account>.blob.<endpoint-suffix>` — account is the leftmost
    /// hostname label. Named `VirtualHosted` for symmetry with
    /// [`S3Addressing::VirtualHosted`]; both describe the
    /// "leftmost-hostname-label" pattern.
    VirtualHosted,
    /// `<host>/<account>/...` — account is the first path segment
    /// (Azurite, custom endpoints).
    PathStyle,
}
102
/// The on-bucket storage format / serialisation engine of a repository.
///
/// `engine` is a bucket-level property: it is written to the `FORMAT`
/// key on the first push and validated on every subsequent connect.
/// The `?engine=` URL parameter is therefore advisory — it only matters
/// when initialising a new repository. After the first push the stored
/// value is authoritative and the URL parameter is checked for
/// conflicts.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StorageEngine {
    /// Git bundle v2 — a text header followed by a PACK file.
    ///
    /// Key layout: `<prefix>/refs/heads/<branch>/<sha>.bundle`.
    Bundle,
    /// Incremental pack-chain engine (issue #52).
    ///
    /// On-bucket layout: `chain.json` (newest-first manifest) plus
    /// `path-index.json` per ref, with content-addressed packs at
    /// `<prefix>/packs/<sha>.{pack,idx}` and a baseline bundle for
    /// first-push fan-out. Push, fetch, direct file access (`read_blob`
    /// library API), compaction, and GC are all implemented; see
    /// `src/packchain/{push,fetch,read,compact,gc}.rs`.
    Packchain,
}

impl StorageEngine {
    /// Every storage engine this client recognises.
    ///
    /// This slice is the single source of truth for anything that has
    /// to enumerate the supported set: name lookup
    /// ([`Self::from_name`]) and diagnostics
    /// ([`Self::supported_list_str`]) both walk it. Append new variants
    /// here and those consumers pick them up automatically.
    pub(crate) const ALL: &'static [Self] = &[Self::Bundle, Self::Packchain];

    /// Look up an engine by its canonical name (see [`Self::as_str`]).
    ///
    /// The comparison is exact (case-sensitive). Returns `None` when no
    /// entry of [`Self::ALL`] carries that name.
    pub(crate) fn from_name(name: &str) -> Option<Self> {
        for &candidate in Self::ALL {
            if candidate.as_str() == name {
                return Some(candidate);
            }
        }
        None
    }

    /// The canonical name for this engine, as stored in the `FORMAT` key
    /// and accepted in the `?engine=` URL parameter.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Packchain => "packchain",
            Self::Bundle => "bundle",
        }
    }

    /// Human-readable comma-separated list of every supported engine
    /// name, each wrapped in backticks (e.g. `` "`bundle`, `packchain`" ``).
    ///
    /// Used by [`ParseError::UnknownEngine`] and
    /// [`crate::protocol::backend::BackendError::UnknownStoredEngine`] so
    /// that diagnostics stay in sync with [`Self::ALL`].
    #[must_use]
    pub(crate) fn supported_list_str() -> String {
        let mut list = String::new();
        for engine in Self::ALL {
            // Every entry is non-empty, so "list is non-empty" is the
            // same as "this is not the first entry".
            if !list.is_empty() {
                list.push_str(", ");
            }
            list.push('`');
            list.push_str(engine.as_str());
            list.push('`');
        }
        list
    }
}

impl fmt::Display for StorageEngine {
    /// Writes the canonical engine name (identical to [`StorageEngine::as_str`]).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = self.as_str();
        f.write_str(name)
    }
}
176
/// Which backend a URL (or error) refers to.
///
/// Used as a discriminant in [`crate::protocol::backend::BackendError`] to select
/// S3 vs Azure error wording, and internally in `url::parse` to route the
/// URL to the right parsing path.
///
/// Marked `#[non_exhaustive]` so adding a new backend (e.g. GCS) is not
/// a breaking change for downstream crates: the attribute obliges their
/// `match` expressions to carry an explicit wildcard branch, so a new
/// variant lands in that branch instead of failing their build. Inside
/// this crate the attribute has no effect and matches stay exhaustive.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum BackendKind {
    /// Amazon S3 (or any S3-compatible) backend.
    S3,
    /// Azure Blob Storage backend.
    Azure,
}

impl BackendKind {
    /// The URL scheme prefix for this backend (`"s3+"` or `"az+"`).
    ///
    /// This is the tag that precedes the `https://` / `http://` part of
    /// a remote URL; `detect_backend` strips it before the rest is
    /// parsed as a URL.
    pub(crate) const fn scheme_prefix(self) -> &'static str {
        match self {
            Self::S3 => "s3+",
            Self::Azure => "az+",
        }
    }
}
206
/// Query-string flags described in §3.2 / §3.3.
///
/// The [`Default`] value has every boolean `false` and every optional
/// flag `None`, i.e. a URL without a query string. The `?addressing=`
/// query parameter is not stored here; the parser returns it separately
/// and folds it into the addressing style of the resulting
/// [`RemoteUrl`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RemoteFlags {
    /// `?zip=1` — push uploads `repo.zip` alongside each bundle.
    pub zip: bool,
    /// `?profile=...` — selects a named AWS profile (S3 only).
    pub profile: Option<String>,
    /// `?credential=...` — names an Azure credential alias.
    pub credential: Option<String>,
    /// `?region=...` — overrides the SDK-derived region (rare).
    pub region: Option<String>,
    /// `?engine=...` — declares the storage engine for a new repository.
    ///
    /// On the first push to an empty bucket this value is written to the
    /// `FORMAT` key. On subsequent connects the stored `FORMAT` value is
    /// authoritative; a conflicting `?engine=` aborts with an error.
    pub engine: Option<StorageEngine>,
    /// `?bundle_uri=1` — opt in to advertising the `bundle-uri` helper
    /// capability so a `git clone` can fetch the packchain baseline
    /// bundle directly (e.g. via a public bucket or CDN) before the
    /// helper protocol negotiates the incremental tail. Only meaningful
    /// for `?engine=packchain` remotes; bundle-engine remotes ignore
    /// the flag because their bundle filenames rotate per push and a
    /// stable URL would race the next push.
    pub bundle_uri: bool,
    /// `?bundle_uri_presign_ttl=<seconds>` — when set on a packchain
    /// remote with `?bundle_uri=1`, the helper presigns each emitted
    /// `bundle.<ref>.uri=<url>` line with an `<seconds>`-TTL signed
    /// URL (S3 `SigV4` or Azure service-SAS). Operators with private
    /// buckets need this; public-read buckets and CDN-fronted
    /// endpoints can leave it unset (the canonical URL works
    /// directly).
    ///
    /// `NonZeroU64` because a zero-second TTL is meaningless (the URL
    /// would expire before any client could observe it). The URL
    /// parser rejects `=0` at the boundary with [`ParseError::InvalidFlagValue`]
    /// and values above [`MAX_BUNDLE_URI_PRESIGN_TTL_SECONDS`] with
    /// [`ParseError::BundleUriPresignTtlTooLarge`].
    /// Issue #76.
    pub bundle_uri_presign_ttl: Option<NonZeroU64>,
}
246
/// Errors produced by [`parse`].
///
/// Each variant's `Display` text (the `#[error(...)]` string) is the
/// user-facing diagnostic.
#[derive(Debug, Error, PartialEq, Eq)]
pub enum ParseError {
    /// Input was empty or whitespace-only (checked after trimming).
    #[error("empty URL")]
    Empty,
    /// Scheme is not one of the four accepted values. The payload is the
    /// part of the input before the first `:` (or the whole input when
    /// it contains no `:`).
    #[error("unsupported scheme `{0}`; expected `s3+https`, `s3+http`, `az+https`, or `az+http`")]
    UnsupportedScheme(String),
    /// The body after the backend prefix could not be parsed as a URL.
    #[error("malformed URL: {0}")]
    InvalidUrl(#[from] url::ParseError),
    /// URL is missing a host component.
    #[error("URL is missing a host")]
    MissingHost,
    /// S3 path-style URL is missing the first path segment (the bucket).
    #[error("URL is missing the bucket segment")]
    MissingBucket,
    /// Azure virtual-hosted URL is missing the first path segment (the
    /// container) — or path-style is missing the second path segment.
    #[error("URL is missing the container segment")]
    MissingContainer,
    /// Azure path-style URL is missing the first path segment (the
    /// account).
    #[error("URL is missing the account segment")]
    MissingAccount,
    /// Bucket name does not match the S3 charset rules in §3.5.
    #[error("invalid bucket name `{0}`")]
    InvalidBucket(String),
    /// Storage-account name does not match the Azure rules in §3.5.
    #[error("invalid storage-account name `{0}`")]
    InvalidAccount(String),
    /// Container name does not match the Azure rules in §3.5.
    #[error("invalid container name `{0}`")]
    InvalidContainer(String),
    /// Cleartext `*+http://` against a non-loopback host without the
    /// override env var ([`ENV_ALLOW_HTTP`]).
    #[error(
        "cleartext http:// is forbidden against non-loopback host `{host}`; \
         set {ENV_ALLOW_HTTP}=1 to override"
    )]
    CleartextHttpForbidden {
        /// The non-loopback host that triggered the rejection.
        host: String,
    },
    /// `?addressing=` value other than `path` or `virtual`.
    #[error("unknown addressing override `{0}`; expected `path` or `virtual`")]
    UnknownAddressing(String),
    /// A known flag had a value outside its accepted set: a boolean flag
    /// (`zip`, `bundle_uri`) with a token outside the truthy / falsy
    /// vocabulary, or a `bundle_uri_presign_ttl` that is `0` or not a
    /// valid unsigned integer.
    #[error("invalid value for flag `{name}`: `{value}`")]
    InvalidFlagValue {
        /// Flag name.
        name: String,
        /// Offending value.
        value: String,
    },
    /// A query parameter is not part of the documented flag set.
    #[error("unknown query flag `{0}`")]
    UnknownFlag(String),
    /// `?engine=` value is not a recognised engine name. The list of
    /// accepted names in the message is generated from
    /// [`StorageEngine::supported_list_str`].
    #[error(
        "unknown engine `{0}`; expected one of {supported}",
        supported = StorageEngine::supported_list_str()
    )]
    UnknownEngine(String),
    /// An `amazonaws.com` hostname that cannot be a valid S3 endpoint.
    ///
    /// Valid patterns are:
    /// - virtual-hosted: `<bucket>.s3[.<region>].amazonaws.com`
    /// - path-style: `s3[.<region>|-<region>].amazonaws.com`
    #[error(
        "hostname `{host}` is not a recognized AWS S3 endpoint; \
         for virtual-hosted use `<bucket>.s3[.<region>].amazonaws.com`, \
         for path-style use `s3[.<region>|-<region>].amazonaws.com`"
    )]
    InvalidAwsS3Endpoint {
        /// The offending hostname.
        host: String,
    },
    /// `?bundle_uri_presign_ttl=<seconds>` exceeded
    /// [`MAX_BUNDLE_URI_PRESIGN_TTL_SECONDS`] (7 days). Rejecting at
    /// the URL boundary prevents a degenerate value from reaching the
    /// AWS SDK (which rejects > 7 days anyway) or the Azure SAS
    /// builder (which previously panicked on `u64::MAX`). Issue #219.
    #[error(
        "bundle_uri_presign_ttl=`{value}` exceeds the 7-day maximum \
         ({max} seconds); presigned URLs cannot be valid for longer"
    )]
    BundleUriPresignTtlTooLarge {
        /// The offending value.
        value: u64,
        /// The maximum accepted value
        /// ([`MAX_BUNDLE_URI_PRESIGN_TTL_SECONDS`]).
        max: u64,
    },
}
343
344/// Parse a remote URL.
345///
346/// # Errors
347///
348/// Returns [`ParseError`] if the input is empty, uses an unsupported
349/// scheme, contains a malformed URL, is missing required components
350/// (host, bucket, container, account), contains invalid component names,
351/// uses an `amazonaws.com` hostname that does not match a known S3
352/// endpoint pattern, or uses cleartext `http://` against a non-loopback
353/// host without the [`ENV_ALLOW_HTTP`] environment override.
354pub fn parse(input: &str) -> Result<RemoteUrl, ParseError> {
355    let trimmed = input.trim();
356    if trimmed.is_empty() {
357        return Err(ParseError::Empty);
358    }
359
360    let (backend, body) = detect_backend(trimmed)?;
361    let endpoint = Url::parse(body)?;
362
363    let host = endpoint
364        .host_str()
365        .ok_or(ParseError::MissingHost)?
366        .to_owned();
367    if endpoint.scheme() == "http" && !is_loopback(&endpoint) && !http_allowed_by_env() {
368        return Err(ParseError::CleartextHttpForbidden { host });
369    }
370
371    let (flags, addressing_override) = extract_flags(&endpoint)?;
372
373    match backend {
374        BackendKind::S3 => finish_s3(endpoint, &host, flags, addressing_override),
375        BackendKind::Azure => finish_azure(endpoint, &host, flags, addressing_override),
376    }
377}
378
379impl FromStr for RemoteUrl {
380    type Err = ParseError;
381
382    fn from_str(s: &str) -> Result<Self, ParseError> {
383        parse(s)
384    }
385}
386
387impl fmt::Display for RemoteUrl {
388    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
389        match self {
390            Self::S3 { endpoint, .. } => write!(f, "s3+{endpoint}"),
391            Self::Azure { endpoint, .. } => write!(f, "az+{endpoint}"),
392        }
393    }
394}
395
396impl RemoteUrl {
397    /// Returns the canonical endpoint URL (without the backend prefix).
398    #[must_use]
399    pub const fn endpoint(&self) -> &Url {
400        match self {
401            Self::S3 { endpoint, .. } | Self::Azure { endpoint, .. } => endpoint,
402        }
403    }
404
405    /// Returns the optional repository prefix.
406    #[must_use]
407    pub fn prefix(&self) -> Option<&str> {
408        match self {
409            Self::S3 { prefix, .. } | Self::Azure { prefix, .. } => prefix.as_deref(),
410        }
411    }
412
413    /// Returns the parsed query flags.
414    #[must_use]
415    pub const fn flags(&self) -> &RemoteFlags {
416        match self {
417            Self::S3 { flags, .. } | Self::Azure { flags, .. } => flags,
418        }
419    }
420
421    /// Returns the backend kind discriminant.
422    #[must_use]
423    pub const fn kind(&self) -> BackendKind {
424        match self {
425            Self::S3 { .. } => BackendKind::S3,
426            Self::Azure { .. } => BackendKind::Azure,
427        }
428    }
429}
430
431// ---------------------------------------------------------------------------
432// Internals
433// ---------------------------------------------------------------------------
434
/// Explicit addressing style requested through the `?addressing=`
/// query parameter. Backend-agnostic: the S3 and Azure finishing steps
/// each translate it into their own addressing enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AddressingOverride {
    /// `?addressing=path` — force path-style addressing.
    Path,
    /// `?addressing=virtual` — force virtual-hosted addressing.
    Virtual,
}
440
441/// Classify the URL by its backend scheme prefix and return both the
442/// detected [`BackendKind`] and the body of the URL with the `s3+` /
443/// `az+` tag stripped. Folding the classification and the strip into
444/// one step keeps `parse()` free of an unreachable fallback for a
445/// mismatched prefix.
446///
447/// Each branch also verifies that the body starts with `https://` or
448/// `http://` so the downstream `Url::parse` sees a recognised scheme.
449fn detect_backend(input: &str) -> Result<(BackendKind, &str), ParseError> {
450    for kind in [BackendKind::S3, BackendKind::Azure] {
451        if let Some(body) = input.strip_prefix(kind.scheme_prefix())
452            && (body.starts_with("https://") || body.starts_with("http://"))
453        {
454            return Ok((kind, body));
455        }
456    }
457    Err(ParseError::UnsupportedScheme(scheme_of(input)))
458}
459
/// Return the portion of `input` that precedes its first `:`, for use
/// in error messages. When `input` contains no `:` the whole string is
/// returned.
fn scheme_of(input: &str) -> String {
    let scheme = match input.split_once(':') {
        Some((before_colon, _)) => before_colon,
        None => input,
    };
    scheme.to_owned()
}
465
466fn is_loopback(u: &Url) -> bool {
467    match u.host() {
468        Some(url::Host::Domain(d)) => d.eq_ignore_ascii_case("localhost"),
469        Some(url::Host::Ipv4(ip)) => ip.is_loopback(),
470        Some(url::Host::Ipv6(ip)) => ip.is_loopback(),
471        None => false,
472    }
473}
474
475fn http_allowed_by_env() -> bool {
476    // Reuse the same vocabulary the URL boolean flags accept so
477    // `ALLOW_HTTP=true` and `ALLOW_HTTP=1` behave identically. Anything
478    // we cannot parse as a boolean (unset, empty, junk) leaves the
479    // gate closed — fail-safe is "no cleartext".
480    env::var(ENV_ALLOW_HTTP)
481        .ok()
482        .as_deref()
483        .and_then(parse_bool_value)
484        .unwrap_or(false)
485}
486
487/// Pull known flags out of the query string. Unknown keys are an error
488/// (fail-fast on typos rather than silently discard configuration).
489fn extract_flags(u: &Url) -> Result<(RemoteFlags, Option<AddressingOverride>), ParseError> {
490    let mut flags = RemoteFlags::default();
491    let mut addressing = None;
492    for (key, value) in u.query_pairs() {
493        match key.as_ref() {
494            "zip" => flags.zip = parse_bool_flag("zip", value.as_ref())?,
495            "profile" => flags.profile = Some(value.into_owned()),
496            "credential" => flags.credential = Some(value.into_owned()),
497            "region" => flags.region = Some(value.into_owned()),
498            "addressing" => {
499                addressing = Some(match value.as_ref() {
500                    "path" => AddressingOverride::Path,
501                    "virtual" => AddressingOverride::Virtual,
502                    other => return Err(ParseError::UnknownAddressing(other.to_owned())),
503                });
504            }
505            "engine" => {
506                flags.engine = Some(
507                    StorageEngine::from_name(value.as_ref())
508                        .ok_or_else(|| ParseError::UnknownEngine(value.into_owned()))?,
509                );
510            }
511            "bundle_uri" => flags.bundle_uri = parse_bool_flag("bundle_uri", value.as_ref())?,
512            "bundle_uri_presign_ttl" => {
513                flags.bundle_uri_presign_ttl = Some(parse_bundle_uri_presign_ttl(value.as_ref())?);
514            }
515            other => return Err(ParseError::UnknownFlag(other.to_owned())),
516        }
517    }
518    Ok((flags, addressing))
519}
520
521fn parse_bool_flag(name: &str, value: &str) -> Result<bool, ParseError> {
522    parse_bool_value(value).ok_or_else(|| ParseError::InvalidFlagValue {
523        name: name.to_owned(),
524        value: value.to_owned(),
525    })
526}
527
/// Single source of truth for boolean-string parsing across the URL
/// query-flag parser and the helper-runtime env-var reads.
///
/// Recognises the conventional "truthy / falsy" vocabulary used by most
/// shells and config files — `1|true|yes|on` map to `Some(true)`,
/// `0|false|no|off` map to `Some(false)` — all ASCII-case-insensitively.
/// Every other token (including the empty string and tokens with
/// surrounding whitespace) yields `None`, leaving the failure mode to
/// the caller: URL flags surface [`ParseError::InvalidFlagValue`], while
/// env-var reads fall back to "unset/false".
///
/// Keeping the vocabulary in one place (rather than open-coding
/// `matches!` at each read site) prevents the divergence reported in
/// issue #187, where `?zip=true` worked in the URL but `ALLOW_HTTP=true`
/// did not.
fn parse_bool_value(value: &str) -> Option<bool> {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    const FALSY: [&str; 4] = ["0", "false", "no", "off"];

    // A single lowercase copy, then plain lookups (#221). Not a hot
    // path; clarity outweighs the per-call String alloc.
    let lowered = value.to_ascii_lowercase();
    let token = lowered.as_str();

    if TRUTHY.contains(&token) {
        Some(true)
    } else if FALSY.contains(&token) {
        Some(false)
    } else {
        None
    }
}
551
552/// Parse a positive integer flag value into [`NonZeroU64`]. Rejects
553/// `0`, negative values, non-numeric junk. Used for `bundle_uri_presign_ttl`
554/// (issue #76).
555fn parse_nonzero_u64_flag(name: &str, value: &str) -> Result<NonZeroU64, ParseError> {
556    let n: u64 = value.parse().map_err(|_| ParseError::InvalidFlagValue {
557        name: name.to_owned(),
558        value: value.to_owned(),
559    })?;
560    NonZeroU64::new(n).ok_or_else(|| ParseError::InvalidFlagValue {
561        name: name.to_owned(),
562        value: value.to_owned(),
563    })
564}
565
566/// Parse `?bundle_uri_presign_ttl=<seconds>`: positive integer in
567/// `1..=MAX_BUNDLE_URI_PRESIGN_TTL_SECONDS`. The upper cap matches
568/// AWS's hard 7-day ceiling on presigned URLs and protects the Azure
569/// SAS builder from `u64`-overflow inputs (issue #219).
570fn parse_bundle_uri_presign_ttl(value: &str) -> Result<NonZeroU64, ParseError> {
571    let ttl = parse_nonzero_u64_flag("bundle_uri_presign_ttl", value)?;
572    if ttl.get() > MAX_BUNDLE_URI_PRESIGN_TTL_SECONDS {
573        return Err(ParseError::BundleUriPresignTtlTooLarge {
574            value: ttl.get(),
575            max: MAX_BUNDLE_URI_PRESIGN_TTL_SECONDS,
576        });
577    }
578    Ok(ttl)
579}
580
581/// Non-empty path segments. Segments are returned verbatim; bucket /
582/// account / container charsets cannot contain percent-encoded bytes,
583/// and the prefix is round-tripped as-stored.
584fn path_segments(u: &Url) -> Vec<String> {
585    u.path_segments()
586        .map(|iter| iter.filter(|s| !s.is_empty()).map(str::to_owned).collect())
587        .unwrap_or_default()
588}
589
/// Join prefix segments with `/` into a single prefix string.
///
/// Returns `None` when there are no segments, so "no prefix" is never
/// represented as an empty string by this function.
fn join_prefix(segments: &[String]) -> Option<String> {
    (!segments.is_empty()).then(|| segments.join("/"))
}
597
598/// Set the URL's path so that [`fmt::Display`] reproduces the canonical
599/// form (with trailing `/` stripped).
600fn set_canonical_path(u: &mut Url, segments: &[&str]) {
601    u.set_path(&format!("/{}", segments.join("/")));
602}
603
604// ---------------------------------------------------------------------------
605// S3
606// ---------------------------------------------------------------------------
607
/// AWS partition suffixes that are owned by AWS and therefore subject to
/// `check_aws_s3_host` validation. Hosts ending in any of these must
/// match a recognised S3 endpoint shape; hosts ending in anything else
/// are treated as third-party S3-compatible endpoints (`MinIO`,
/// Cloudflare R2, …) and skip the check entirely.
///
/// Order is irrelevant for correctness: a host that ends in
/// `.amazonaws.com.cn` does not end in `.amazonaws.com` (the trailing
/// `.cn` rules that out), so the two suffixes are mutually exclusive on
/// any given host. The China entry is listed first by convention only.
pub(crate) const AWS_HOST_SUFFIXES: &[&str] = &[".amazonaws.com.cn", ".amazonaws.com"];

/// Answer "is this an AWS partition host, and what is the leading
/// portion?" in one call.
///
/// When `host` ends in one of [`AWS_HOST_SUFFIXES`] the host is returned
/// with that suffix removed (the suffix includes its leading dot, so
/// `s3.amazonaws.com` yields `s3`); otherwise the result is `None`.
pub(crate) fn strip_aws_host_suffix(host: &str) -> Option<&str> {
    for suffix in AWS_HOST_SUFFIXES {
        if let Some(leading) = host.strip_suffix(*suffix) {
            return Some(leading);
        }
    }
    None
}
628
629/// Reject AWS hostnames (`.amazonaws.com` and `.amazonaws.com.cn`) that
630/// cannot be valid S3 endpoints.
631///
632/// Third-party S3-compatible endpoints (custom hosts, `MinIO`, R2, …)
633/// are passed through unconditionally — they do not end in an AWS
634/// partition suffix. For AWS hosts, after stripping the partition
635/// suffix the remainder must match one of:
636///
637/// - `s3` (legacy global path-style: `s3.amazonaws.com`)
638/// - `s3.<region>` (path-style with region: `s3.us-west-2.amazonaws.com`)
639/// - `s3-<region>` (legacy hyphenated path-style:
640///   `s3-us-east-1.amazonaws.com`)
641/// - end with `.s3` (no-region virtual-hosted:
642///   `<bucket>.s3.amazonaws.com`, where the trailing `.s3` label is
643///   the AWS service marker for the legacy global form)
644/// - contain `.s3.` or `.s3-` (virtual-hosted with region:
645///   `<bucket>.s3.<region>.amazonaws.com` /
646///   `<bucket>.s3-<region>.amazonaws.com`)
647///
648/// The common mistake `<bucket>.<region>.amazonaws.com` — missing the
649/// `.s3.` service marker — would otherwise silently fall through to
650/// path-style addressing with a non-existent endpoint hostname,
651/// producing an inscrutable DNS-resolution error at connect time.
652///
653/// **Policy on `?addressing=` override:** this check runs before the
654/// addressing override is applied, so `?addressing=path` (or
655/// `=virtual`) on an AWS hostname does not bypass it. AWS owns
656/// `.amazonaws.com[.cn]`; any host on those suffixes that is not a
657/// recognised S3 endpoint is a typo, and a fast-fail with the helpful
658/// `InvalidAwsS3Endpoint` error is preferable to letting the user pick
659/// any addressing style they want against a non-existent endpoint.
660fn check_aws_s3_host(host: &str) -> Result<(), ParseError> {
661    let Some(trimmed) = strip_aws_host_suffix(host) else {
662        // Not an AWS host — third-party S3-compatible endpoint, always OK.
663        return Ok(());
664    };
665
666    // `<bucket>.s3.amazonaws.com` → trimmed is `<bucket>.s3`; the last
667    // dot-separated label is "s3" (global virtual-hosted, no region).
668    // This is the only branch that catches the no-region virtual-hosted
669    // shape — it is NOT redundant with the `.s3.` / `.s3-` infix checks
670    // (which require a region segment after the marker).
671    let last_label_is_s3 = trimmed.split('.').next_back() == Some("s3");
672
673    let valid = trimmed == "s3"
674        || trimmed.starts_with("s3.")
675        // Legacy path-style hyphenated form: `s3-<region>.amazonaws.com`.
676        // Accepts any `s3-*` prefix without validating the region name, so
677        // `s3-mybucket.amazonaws.com` is a known false-negative (passes the
678        // check but is not a real S3 endpoint; user sees a DNS error rather
679        // than this helpful message). Tightening would require a region
680        // allowlist, which is fragile as AWS adds regions.
681        || trimmed.starts_with("s3-")
682        || last_label_is_s3
683        || trimmed.contains(".s3.")
684        || trimmed.contains(".s3-");
685
686    if !valid {
687        return Err(ParseError::InvalidAwsS3Endpoint {
688            host: host.to_owned(),
689        });
690    }
691    Ok(())
692}
693
694fn finish_s3(
695    mut endpoint: Url,
696    host: &str,
697    flags: RemoteFlags,
698    addressing_override: Option<AddressingOverride>,
699) -> Result<RemoteUrl, ParseError> {
700    let segments = path_segments(&endpoint);
701
702    check_aws_s3_host(host)?;
703
704    let (addressing, bucket, prefix_segments) =
705        resolve_s3_components(host, &segments, addressing_override)?;
706
707    if !is_valid_bucket(&bucket) {
708        return Err(ParseError::InvalidBucket(bucket));
709    }
710    let prefix = join_prefix(prefix_segments);
711
712    // Re-emit a canonical path so Display round-trips cleanly.
713    let canonical: Vec<&str> = match addressing {
714        S3Addressing::VirtualHosted => prefix_segments.iter().map(String::as_str).collect(),
715        S3Addressing::PathStyle => std::iter::once(bucket.as_str())
716            .chain(prefix_segments.iter().map(String::as_str))
717            .collect(),
718    };
719    set_canonical_path(&mut endpoint, &canonical);
720
721    Ok(RemoteUrl::S3 {
722        endpoint,
723        bucket,
724        prefix,
725        addressing,
726        flags,
727    })
728}
729
730/// Determine S3 addressing style and extract the bucket name and prefix
731/// segments from the URL's host and path.
732///
733/// Path-style skips the `rfind` scan entirely; virtual-hosted (auto or
734/// explicit) runs it once and reuses the result for both detection and
735/// extraction.
736fn resolve_s3_components<'a>(
737    host: &str,
738    segments: &'a [String],
739    addressing_override: Option<AddressingOverride>,
740) -> Result<(S3Addressing, String, &'a [String]), ParseError> {
741    // Compute addressing and the AWS bucket prefix together.
742    let (addressing, aws_bucket) = match addressing_override {
743        Some(AddressingOverride::Path) => (S3Addressing::PathStyle, None),
744        Some(AddressingOverride::Virtual) => {
745            (S3Addressing::VirtualHosted, s3_virtual_hosted_bucket(host))
746        }
747        None => {
748            let b = s3_virtual_hosted_bucket(host);
749            let style = if b.is_some() {
750                S3Addressing::VirtualHosted
751            } else {
752                S3Addressing::PathStyle
753            };
754            (style, b)
755        }
756    };
757
758    let (bucket, prefix_segments) = match addressing {
759        S3Addressing::VirtualHosted => {
760            // `aws_bucket` covers both auto-detected and explicit
761            // `?addressing=virtual` for AWS hosts. Falls back to the
762            // leftmost label for non-AWS virtual-hosted endpoints, which
763            // by convention put the bucket as the leftmost label.
764            let bucket = aws_bucket
765                .or_else(|| leftmost_label(host))
766                .ok_or(ParseError::MissingBucket)?;
767            (bucket, segments)
768        }
769        S3Addressing::PathStyle => {
770            let (head, tail) = segments.split_first().ok_or(ParseError::MissingBucket)?;
771            (head.clone(), tail)
772        }
773    };
774
775    Ok((addressing, bucket, prefix_segments))
776}
777
/// Service markers that separate a virtual-hosted bucket name from the
/// `s3[.-]<region>.amazonaws.com` tail of an AWS host. Lookups take the
/// rightmost occurrence (see `s3_virtual_hosted_bucket`), so a bucket
/// name that contains dots — or even a literal `.s3.` — is kept whole
/// and only the final service marker is treated as the separator.
pub(crate) const AWS_S3_INFIXES: &[&str] = &[".s3.", ".s3-"];

/// Return everything in `host` that precedes the rightmost AWS `.s3.`
/// or `.s3-` marker, i.e. the bucket of a virtual-hosted AWS URL.
///
/// Yields `None` when no marker is present or when nothing precedes
/// it; callers fall back to `leftmost_label` for non-AWS endpoints
/// reached via `?addressing=virtual`.
///
/// Searching from the right keeps bucket names that themselves contain
/// `.s3.` / `.s3-` intact, and slicing at the marker (rather than at
/// the first dot) preserves dotted names such as `bucketname.com`.
pub(crate) fn s3_virtual_hosted_bucket(host: &str) -> Option<String> {
    // Every marker has the same byte length, so the largest `rfind`
    // offset is the rightmost marker regardless of which one matched.
    let cut = AWS_S3_INFIXES
        .iter()
        .filter_map(|marker| host.rfind(*marker))
        .max()?;

    let bucket = &host[..cut];
    if bucket.is_empty() {
        None
    } else {
        Some(bucket.to_owned())
    }
}
807
/// Return the text of `host` up to (not including) its first `.`, or
/// the whole host when it contains no dot. Yields `None` when that
/// leading label is empty.
fn leftmost_label(host: &str) -> Option<String> {
    let label = match host.split_once('.') {
        Some((head, _)) => head,
        None => host,
    };
    if label.is_empty() {
        None
    } else {
        Some(label.to_owned())
    }
}
814
815// ---------------------------------------------------------------------------
816// Azure
817// ---------------------------------------------------------------------------
818
819fn finish_azure(
820    mut endpoint: Url,
821    host: &str,
822    flags: RemoteFlags,
823    addressing_override: Option<AddressingOverride>,
824) -> Result<RemoteUrl, ParseError> {
825    let segments = path_segments(&endpoint);
826
827    let addressing = match addressing_override {
828        Some(AddressingOverride::Path) => AzureAddressing::PathStyle,
829        Some(AddressingOverride::Virtual) => AzureAddressing::VirtualHosted,
830        None => detect_azure_addressing(host),
831    };
832
833    let (account, container, prefix_segments) =
834        resolve_azure_components(addressing, host, &segments)?;
835
836    if !is_valid_account(&account) {
837        return Err(ParseError::InvalidAccount(account));
838    }
839    if !is_valid_container(&container) {
840        return Err(ParseError::InvalidContainer(container));
841    }
842    let prefix = join_prefix(prefix_segments);
843
844    let canonical: Vec<&str> = match addressing {
845        AzureAddressing::VirtualHosted => std::iter::once(container.as_str())
846            .chain(prefix_segments.iter().map(String::as_str))
847            .collect(),
848        AzureAddressing::PathStyle => std::iter::once(account.as_str())
849            .chain(std::iter::once(container.as_str()))
850            .chain(prefix_segments.iter().map(String::as_str))
851            .collect(),
852    };
853    set_canonical_path(&mut endpoint, &canonical);
854
855    Ok(RemoteUrl::Azure {
856        endpoint,
857        account,
858        container,
859        prefix,
860        addressing,
861        flags,
862    })
863}
864
865/// Extract the storage account, container, and prefix segments from the
866/// URL's host and path, according to the resolved addressing style.
867fn resolve_azure_components<'a>(
868    addressing: AzureAddressing,
869    host: &str,
870    segments: &'a [String],
871) -> Result<(String, String, &'a [String]), ParseError> {
872    match addressing {
873        AzureAddressing::VirtualHosted => {
874            let account = leftmost_label(host).ok_or(ParseError::MissingAccount)?;
875            match segments {
876                [] => Err(ParseError::MissingContainer),
877                [container, rest @ ..] => Ok((account, container.clone(), rest)),
878            }
879        }
880        AzureAddressing::PathStyle => match segments {
881            [] => Err(ParseError::MissingAccount),
882            [_] => Err(ParseError::MissingContainer),
883            [account, container, rest @ ..] => Ok((account.clone(), container.clone(), rest)),
884        },
885    }
886}
887
888fn detect_azure_addressing(host: &str) -> AzureAddressing {
889    // §3.4: virtual-hosted iff the second hostname label is `blob`.
890    // Hosts are already lowercased by the `url` crate (RFC 3986).
891    if host.split('.').nth(1) == Some("blob") {
892        AzureAddressing::VirtualHosted
893    } else {
894        AzureAddressing::PathStyle
895    }
896}
897
898// ---------------------------------------------------------------------------
899// Validation (§3.5)
900// ---------------------------------------------------------------------------
901
902/// AWS-reserved bucket-name prefixes. See
903/// <https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html>.
904const FORBIDDEN_BUCKET_PREFIXES: &[&str] = &["xn--", "sthree-", "amzn-s3-demo-"];
905
906/// AWS-reserved bucket-name suffixes. See the same AWS doc.
907const FORBIDDEN_BUCKET_SUFFIXES: &[&str] =
908    &["-s3alias", "--ol-s3", ".mrap", "--x-s3", "--table-s3"];
909
910/// AWS S3 General Purpose bucket-naming rules: 3–63 chars, lowercase
911/// alphanumerics plus `.` and `-`, must begin and end with a letter or
912/// digit, no consecutive periods, not formatted as an IPv4 address, and
913/// none of the AWS reserved prefixes or suffixes.
914fn is_valid_bucket(s: &str) -> bool {
915    let bytes = s.as_bytes();
916    let (Some(&first), Some(&last)) = (bytes.first(), bytes.last()) else {
917        return false;
918    };
919    (3..=63).contains(&bytes.len())
920        && is_ascii_alphanum_lower(first)
921        && is_ascii_alphanum_lower(last)
922        && bytes
923            .iter()
924            .all(|b| is_ascii_alphanum_lower(*b) || matches!(*b, b'.' | b'-'))
925        && !s.contains("..")
926        && !is_ipv4_formatted(s)
927        && !FORBIDDEN_BUCKET_PREFIXES.iter().any(|p| s.starts_with(p))
928        && !FORBIDDEN_BUCKET_SUFFIXES.iter().any(|p| s.ends_with(p))
929}
930
931/// `[a-z0-9]{3,24}` — Azure storage-account naming rule.
932fn is_valid_account(s: &str) -> bool {
933    (3..=24).contains(&s.len()) && s.bytes().all(is_ascii_alphanum_lower)
934}
935
936/// Azure container-naming rule: 3–63 chars, lowercase alphanumerics plus
937/// `-`, must begin and end with a letter or digit, and no consecutive
938/// hyphens. See
939/// <https://learn.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata>.
940fn is_valid_container(s: &str) -> bool {
941    let bytes = s.as_bytes();
942    let (Some(&first), Some(&last)) = (bytes.first(), bytes.last()) else {
943        return false;
944    };
945    (3..=63).contains(&bytes.len())
946        && is_ascii_alphanum_lower(first)
947        && is_ascii_alphanum_lower(last)
948        && bytes
949            .iter()
950            .all(|b| is_ascii_alphanum_lower(*b) || *b == b'-')
951        && !s.contains("--")
952}
953
/// True when `b` is an ASCII lowercase letter (`a`–`z`) or an ASCII
/// digit (`0`–`9`).
const fn is_ascii_alphanum_lower(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'0'..=b'9')
}
957
/// True iff `s` has the shape of a dotted-quad IPv4 address: exactly
/// four `.`-separated parts, each non-empty and made only of ASCII
/// digits. Numeric range is not checked, so `999.999.999.999` counts.
/// AWS rejects bucket names with this shape regardless of whether the
/// address is routable.
fn is_ipv4_formatted(s: &str) -> bool {
    let mut octets = s.split('.');

    // The first four parts must all exist and be digit-only.
    for _ in 0..4 {
        match octets.next() {
            Some(part) if !part.is_empty() && part.bytes().all(|b| b.is_ascii_digit()) => {}
            _ => return false,
        }
    }

    // A fifth part of any kind means this is not a dotted quad.
    octets.next().is_none()
}
974
975#[cfg(test)]
976mod tests {
977    use super::*;
978
979    #[test]
980    fn rejects_empty() {
981        assert_eq!(parse(""), Err(ParseError::Empty));
982        assert_eq!(parse("   "), Err(ParseError::Empty));
983    }
984
985    #[test]
986    fn rejects_unknown_scheme() {
987        let err = parse("https://example.com/bucket").unwrap_err();
988        assert!(matches!(err, ParseError::UnsupportedScheme(s) if s == "https"));
989    }
990
991    #[test]
992    fn rejects_backend_tag_with_unsupported_inner_scheme() {
993        // `detect_backend` must check both the `s3+`/`az+` tag and the
994        // inner `http(s)://` scheme — otherwise an `s3+ftp://` URL would
995        // sneak past classification and surface as a confusing downstream
996        // `Url::parse` error.
997        for input in [
998            "s3+ftp://example.com/b",
999            "az+ftp://acct.blob.core.windows.net/c",
1000        ] {
1001            let err = parse(input).unwrap_err();
1002            assert!(
1003                matches!(&err, ParseError::UnsupportedScheme(_)),
1004                "expected UnsupportedScheme for {input}, got {err:?}",
1005            );
1006        }
1007    }
1008
1009    #[test]
1010    fn validates_bucket_charset() {
1011        assert!(is_valid_bucket("my-bucket"));
1012        assert!(is_valid_bucket("a23"));
1013        assert!(is_valid_bucket("a.b.c"));
1014        assert!(!is_valid_bucket("ab"));
1015        assert!(!is_valid_bucket("-leading-dash"));
1016        assert!(!is_valid_bucket("trailing-dash-"));
1017        assert!(!is_valid_bucket(".leading-dot"));
1018        assert!(!is_valid_bucket("trailing-dot."));
1019        assert!(!is_valid_bucket("UPPER"));
1020        assert!(!is_valid_bucket(&"a".repeat(64)));
1021    }
1022
1023    #[test]
1024    fn rejects_bucket_with_consecutive_dots() {
1025        assert!(!is_valid_bucket("ab..cd"));
1026        assert!(!is_valid_bucket("a..b"));
1027    }
1028
1029    #[test]
1030    fn rejects_bucket_formatted_like_ipv4() {
1031        assert!(!is_valid_bucket("192.168.1.1"));
1032        assert!(!is_valid_bucket("1.2.3.4"));
1033        assert!(!is_valid_bucket("999.999.999.999"));
1034        // Three or five segments are not IPv4-shaped.
1035        assert!(is_valid_bucket("1.2.3"));
1036        assert!(is_valid_bucket("1.2.3.4.5"));
1037    }
1038
1039    #[test]
1040    fn rejects_forbidden_bucket_prefixes() {
1041        assert!(!is_valid_bucket("xn--abc"));
1042        assert!(!is_valid_bucket("sthree-foo"));
1043        assert!(!is_valid_bucket("amzn-s3-demo-bucket"));
1044    }
1045
1046    #[test]
1047    fn rejects_forbidden_bucket_suffixes() {
1048        assert!(!is_valid_bucket("my-bucket-s3alias"));
1049        assert!(!is_valid_bucket("my-bucket--ol-s3"));
1050        assert!(!is_valid_bucket("my-bucket--x-s3"));
1051        assert!(!is_valid_bucket("my-bucket--table-s3"));
1052        assert!(!is_valid_bucket("ab.mrap"));
1053    }
1054
1055    #[test]
1056    fn ipv4_formatted_helper() {
1057        assert!(is_ipv4_formatted("0.0.0.0"));
1058        assert!(is_ipv4_formatted("10.20.30.40"));
1059        assert!(!is_ipv4_formatted("a.b.c.d"));
1060        assert!(!is_ipv4_formatted("1.2.3"));
1061        assert!(!is_ipv4_formatted("1.2.3.4.5"));
1062        assert!(!is_ipv4_formatted("1..2.3"));
1063        assert!(!is_ipv4_formatted(".1.2.3.4"));
1064    }
1065
1066    #[test]
1067    fn validates_account_charset() {
1068        assert!(is_valid_account("myacct1"));
1069        assert!(!is_valid_account("ab"));
1070        assert!(!is_valid_account("has-hyphen"));
1071        assert!(!is_valid_account(&"a".repeat(25)));
1072    }
1073
1074    #[test]
1075    fn validates_container_charset() {
1076        assert!(is_valid_container("my-container"));
1077        assert!(is_valid_container("a-b-c"));
1078        assert!(!is_valid_container("ab"));
1079        assert!(!is_valid_container("UPPER"));
1080        assert!(!is_valid_container(&"a".repeat(64)));
1081    }
1082
1083    #[test]
1084    fn rejects_container_with_dash_at_boundary() {
1085        assert!(!is_valid_container("-leading"));
1086        assert!(!is_valid_container("trailing-"));
1087    }
1088
1089    #[test]
1090    fn rejects_container_with_consecutive_dashes() {
1091        assert!(!is_valid_container("a--b"));
1092        assert!(!is_valid_container("foo--bar"));
1093    }
1094
1095    #[test]
1096    fn s3_addressing_heuristic() {
1097        // Auto-detection is now expressed as s3_virtual_hosted_bucket.is_some().
1098        assert!(s3_virtual_hosted_bucket("my-bucket.s3.us-west-2.amazonaws.com").is_some());
1099        assert!(s3_virtual_hosted_bucket("s3.us-west-2.amazonaws.com").is_none());
1100        assert!(s3_virtual_hosted_bucket("acc.r2.cloudflarestorage.com").is_none());
1101    }
1102
1103    #[test]
1104    fn s3_addressing_heuristic_dotted_bucket() {
1105        // Bucket names with embedded dots stretch the host across more
1106        // than two labels — auto-detection must still recognise the
1107        // virtual-hosted shape.
1108        assert!(s3_virtual_hosted_bucket("bucketname.com.s3.us-west-2.amazonaws.com").is_some());
1109        assert!(s3_virtual_hosted_bucket("my.dotted.s3.us-west-2.amazonaws.com").is_some());
1110        // Legacy `s3-<region>` hyphenated form.
1111        assert!(s3_virtual_hosted_bucket("bucketname.com.s3-us-west-2.amazonaws.com").is_some());
1112    }
1113
1114    #[test]
1115    fn s3_virtual_hosted_bucket_extracts_full_prefix() {
1116        assert_eq!(
1117            s3_virtual_hosted_bucket("my-bucket.s3.us-west-2.amazonaws.com"),
1118            Some("my-bucket".to_owned())
1119        );
1120        assert_eq!(
1121            s3_virtual_hosted_bucket("bucketname.com.s3.us-west-2.amazonaws.com"),
1122            Some("bucketname.com".to_owned())
1123        );
1124        assert_eq!(
1125            s3_virtual_hosted_bucket("my.dotted.s3.us-west-2.amazonaws.com"),
1126            Some("my.dotted".to_owned())
1127        );
1128        assert_eq!(
1129            s3_virtual_hosted_bucket("bucketname.com.s3-us-west-2.amazonaws.com"),
1130            Some("bucketname.com".to_owned())
1131        );
1132        // Path-style host has no `.s3.` infix preceded by anything —
1133        // returns None so the caller falls through.
1134        assert_eq!(s3_virtual_hosted_bucket("s3.us-west-2.amazonaws.com"), None);
1135        // Non-AWS host: no infix.
1136        assert_eq!(
1137            s3_virtual_hosted_bucket("acc.r2.cloudflarestorage.com"),
1138            None
1139        );
1140        // Pathological: bucket name itself contains `.s3.`. The
1141        // rightmost infix is the AWS service marker, so the full
1142        // bucket prefix is recovered.
1143        assert_eq!(
1144            s3_virtual_hosted_bucket("my.s3.bucket.s3.us-west-2.amazonaws.com"),
1145            Some("my.s3.bucket".to_owned())
1146        );
1147    }
1148
1149    #[test]
1150    fn azure_addressing_heuristic() {
1151        assert_eq!(
1152            detect_azure_addressing("my-account.blob.core.windows.net"),
1153            AzureAddressing::VirtualHosted
1154        );
1155        assert_eq!(
1156            detect_azure_addressing("127.0.0.1"),
1157            AzureAddressing::PathStyle
1158        );
1159    }
1160
1161    #[test]
1162    fn azure_path_style_with_account_only_rejects_missing_container() {
1163        // Path-style: host/account/container/prefix. Exactly one path
1164        // segment means the container is absent — must be a parse error.
1165        let err = parse("az+https://127.0.0.1/myaccount").unwrap_err();
1166        assert!(
1167            matches!(err, ParseError::MissingContainer),
1168            "expected MissingContainer, got {err:?}",
1169        );
1170    }
1171
1172    // --- StorageEngine and ?engine= flag ---------------------------------
1173
1174    #[test]
1175    fn engine_flag_absent_leaves_none() {
1176        let url = parse("s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo").unwrap();
1177        assert_eq!(url.flags().engine, None);
1178    }
1179
1180    #[test]
1181    fn engine_flag_bundle_parses() {
1182        let url =
1183            parse("s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo?engine=bundle").unwrap();
1184        assert_eq!(url.flags().engine, Some(StorageEngine::Bundle));
1185    }
1186
1187    #[test]
1188    fn engine_flag_rejects_unknown_value() {
1189        let err =
1190            parse("s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo?engine=pack").unwrap_err();
1191        assert!(
1192            matches!(err, ParseError::UnknownEngine(ref s) if s == "pack"),
1193            "expected UnknownEngine(pack), got {err:?}",
1194        );
1195    }
1196
1197    #[test]
1198    fn engine_flag_rejects_empty_value() {
1199        let err =
1200            parse("s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo?engine=").unwrap_err();
1201        assert!(
1202            matches!(err, ParseError::UnknownEngine(ref s) if s.is_empty()),
1203            "expected UnknownEngine(\"\"), got {err:?}",
1204        );
1205    }
1206
1207    #[test]
1208    fn unknown_engine_error_message_lists_every_supported_engine() {
1209        // Iterating over `StorageEngine::ALL` keeps this regression test
1210        // synchronised with the enum: a new variant whose name is missing
1211        // from the rendered diagnostic fails this assertion.
1212        let err =
1213            parse("s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo?engine=pack").unwrap_err();
1214        let rendered = err.to_string();
1215        assert!(
1216            rendered.contains("unknown engine `pack`"),
1217            "missing rejected-value in `{rendered}`",
1218        );
1219        for engine in StorageEngine::ALL {
1220            assert!(
1221                rendered.contains(&format!("`{}`", engine.as_str())),
1222                "UnknownEngine message must mention engine `{}`, got `{rendered}`",
1223                engine.as_str(),
1224            );
1225        }
1226    }
1227
1228    #[test]
1229    fn engine_as_str_roundtrips() {
1230        assert_eq!(StorageEngine::Bundle.as_str(), "bundle");
1231        assert_eq!(StorageEngine::Bundle.to_string(), "bundle");
1232        assert_eq!(StorageEngine::Packchain.as_str(), "packchain");
1233        assert_eq!(StorageEngine::Packchain.to_string(), "packchain");
1234    }
1235
1236    #[test]
1237    fn engine_from_name_parses_known_and_rejects_unknown() {
1238        assert_eq!(
1239            StorageEngine::from_name("bundle"),
1240            Some(StorageEngine::Bundle)
1241        );
1242        assert_eq!(
1243            StorageEngine::from_name("packchain"),
1244            Some(StorageEngine::Packchain)
1245        );
1246        assert_eq!(StorageEngine::from_name("pack"), None);
1247        assert_eq!(StorageEngine::from_name(""), None);
1248        assert_eq!(StorageEngine::from_name("Bundle"), None); // case-sensitive
1249        assert_eq!(StorageEngine::from_name("Packchain"), None); // case-sensitive
1250    }
1251
1252    #[test]
1253    fn engine_flag_packchain_parses() {
1254        let url =
1255            parse("s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo?engine=packchain").unwrap();
1256        assert_eq!(url.flags().engine, Some(StorageEngine::Packchain));
1257    }
1258
1259    // --- bundle_uri flag (issue #71) -------------------------------------
1260
1261    #[test]
1262    fn bundle_uri_flag_absent_defaults_to_false() {
1263        let url = parse("s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo").unwrap();
1264        assert!(!url.flags().bundle_uri);
1265    }
1266
1267    #[test]
1268    fn bundle_uri_flag_one_sets_true() {
1269        let url = parse(
1270            "s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo?engine=packchain&bundle_uri=1",
1271        )
1272        .unwrap();
1273        assert!(url.flags().bundle_uri);
1274    }
1275
1276    #[test]
1277    fn bundle_uri_flag_zero_sets_false() {
1278        let url = parse(
1279            "s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo?engine=packchain&bundle_uri=0",
1280        )
1281        .unwrap();
1282        assert!(!url.flags().bundle_uri);
1283    }
1284
1285    // --- bundle_uri_presign_ttl flag (issue #76) -------------------------
1286
1287    #[test]
1288    fn bundle_uri_presign_ttl_absent_defaults_to_none() {
1289        let url = parse(
1290            "s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo?engine=packchain&bundle_uri=1",
1291        )
1292        .unwrap();
1293        assert_eq!(url.flags().bundle_uri_presign_ttl, None);
1294    }
1295
1296    #[test]
1297    fn bundle_uri_presign_ttl_positive_int_parses() {
1298        let url = parse(
1299            "s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo\
1300             ?engine=packchain&bundle_uri=1&bundle_uri_presign_ttl=3600",
1301        )
1302        .unwrap();
1303        assert_eq!(
1304            url.flags().bundle_uri_presign_ttl,
1305            Some(NonZeroU64::new(3600).expect("3600 is non-zero")),
1306        );
1307    }
1308
1309    #[test]
1310    fn bundle_uri_presign_ttl_one_second_accepted() {
1311        // Useless in practice but the type-system contract is "any
1312        // positive value"; operator's prerogative to choose.
1313        let url = parse(
1314            "s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo\
1315             ?engine=packchain&bundle_uri=1&bundle_uri_presign_ttl=1",
1316        )
1317        .unwrap();
1318        assert_eq!(
1319            url.flags().bundle_uri_presign_ttl,
1320            Some(NonZeroU64::new(1).expect("1 is non-zero")),
1321        );
1322    }
1323
1324    #[test]
1325    fn bundle_uri_presign_ttl_zero_rejected() {
1326        // Zero-second TTL is meaningless; reject at the boundary
1327        // rather than letting the bad value flow into the
1328        // (presigning) backend.
1329        let err = parse(
1330            "s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo\
1331             ?engine=packchain&bundle_uri=1&bundle_uri_presign_ttl=0",
1332        )
1333        .unwrap_err();
1334        assert!(
1335            matches!(
1336                err,
1337                ParseError::InvalidFlagValue { ref name, ref value }
1338                    if name == "bundle_uri_presign_ttl" && value == "0"
1339            ),
1340            "expected InvalidFlagValue {{ name: bundle_uri_presign_ttl, value: 0 }}, got {err:?}",
1341        );
1342    }
1343
1344    #[test]
1345    fn bundle_uri_presign_ttl_non_numeric_rejected() {
1346        let err = parse(
1347            "s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo\
1348             ?engine=packchain&bundle_uri=1&bundle_uri_presign_ttl=abc",
1349        )
1350        .unwrap_err();
1351        assert!(
1352            matches!(
1353                err,
1354                ParseError::InvalidFlagValue { ref name, ref value }
1355                    if name == "bundle_uri_presign_ttl" && value == "abc"
1356            ),
1357            "expected InvalidFlagValue, got {err:?}",
1358        );
1359    }
1360
1361    #[test]
1362    fn bundle_uri_presign_ttl_negative_rejected() {
1363        // u64 parser rejects negative input; surface as InvalidFlagValue.
1364        let err = parse(
1365            "s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo\
1366             ?engine=packchain&bundle_uri=1&bundle_uri_presign_ttl=-1",
1367        )
1368        .unwrap_err();
1369        assert!(
1370            matches!(err, ParseError::InvalidFlagValue { ref name, .. } if name == "bundle_uri_presign_ttl"),
1371            "expected InvalidFlagValue, got {err:?}",
1372        );
1373    }
1374
1375    /// Issue #219: huge values panic the Azure SAS builder via
1376    /// `time::Duration::seconds_f64`. The URL boundary caps the flag
1377    /// at [`MAX_BUNDLE_URI_PRESIGN_TTL_SECONDS`] (7 days) so the bad
1378    /// value never reaches the helper, matching the AWS SDK's hard
1379    /// ceiling.
1380    #[test]
1381    fn bundle_uri_presign_ttl_above_seven_days_rejected() {
1382        let err = parse(
1383            "s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo\
1384             ?engine=packchain&bundle_uri=1&bundle_uri_presign_ttl=604801",
1385        )
1386        .unwrap_err();
1387        assert!(
1388            matches!(
1389                err,
1390                ParseError::BundleUriPresignTtlTooLarge { value, max }
1391                    if value == 604_801 && max == MAX_BUNDLE_URI_PRESIGN_TTL_SECONDS
1392            ),
1393            "expected BundleUriPresignTtlTooLarge {{ value: 604801, max: {MAX_BUNDLE_URI_PRESIGN_TTL_SECONDS} }}, got {err:?}",
1394        );
1395    }
1396
1397    /// Issue #219: the pathological `u64::MAX`-class value reported
1398    /// in the bug must be rejected at the URL boundary with a clean
1399    /// error rather than panicking the helper.
1400    #[test]
1401    fn bundle_uri_presign_ttl_huge_value_rejected_not_panic() {
1402        let err = parse(
1403            "s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo\
1404             ?engine=packchain&bundle_uri=1&bundle_uri_presign_ttl=999999999999999999",
1405        )
1406        .unwrap_err();
1407        assert!(
1408            matches!(
1409                err,
1410                ParseError::BundleUriPresignTtlTooLarge { value, .. }
1411                    if value == 999_999_999_999_999_999
1412            ),
1413            "expected BundleUriPresignTtlTooLarge for huge value, got {err:?}",
1414        );
1415    }
1416
1417    /// Issue #219: the 7-day boundary value itself is accepted so
1418    /// operators can express AWS's spec-mandated maximum.
1419    #[test]
1420    fn bundle_uri_presign_ttl_exactly_seven_days_accepted() {
1421        let url = parse(
1422            "s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo\
1423             ?engine=packchain&bundle_uri=1&bundle_uri_presign_ttl=604800",
1424        )
1425        .unwrap();
1426        assert_eq!(
1427            url.flags().bundle_uri_presign_ttl,
1428            Some(
1429                NonZeroU64::new(MAX_BUNDLE_URI_PRESIGN_TTL_SECONDS).expect("7-day cap is non-zero")
1430            ),
1431        );
1432    }
1433
1434    #[test]
1435    fn engine_flag_packchain_on_azure_url() {
1436        let url =
1437            parse("az+https://myaccount.blob.core.windows.net/my-container/repo?engine=packchain")
1438                .unwrap();
1439        assert_eq!(url.flags().engine, Some(StorageEngine::Packchain));
1440    }
1441
1442    #[test]
1443    fn engine_flag_on_azure_url() {
1444        let url =
1445            parse("az+https://myaccount.blob.core.windows.net/my-container/repo?engine=bundle")
1446                .unwrap();
1447        assert_eq!(url.flags().engine, Some(StorageEngine::Bundle));
1448    }
1449
1450    // --- AWS S3 endpoint host validation ------------------------------------
1451
1452    #[test]
1453    fn rejects_amazonaws_host_missing_s3_service_marker() {
1454        // The common mistake: <bucket>.<region>.amazonaws.com — no `.s3.`.
1455        let err = parse("s3+https://git-test-2224.us-west-2.amazonaws.com/git-remote-object-store")
1456            .unwrap_err();
1457        assert!(
1458            matches!(err, ParseError::InvalidAwsS3Endpoint { ref host } if host == "git-test-2224.us-west-2.amazonaws.com"),
1459            "expected InvalidAwsS3Endpoint, got {err:?}",
1460        );
1461    }
1462
1463    #[test]
1464    fn accepts_valid_aws_s3_hosts() {
1465        // Virtual-hosted with region.
1466        parse("s3+https://my-bucket.s3.us-west-2.amazonaws.com/repo").unwrap();
1467        // Virtual-hosted without region (legacy global).
1468        parse("s3+https://my-bucket.s3.amazonaws.com/repo").unwrap();
1469        // Virtual-hosted legacy hyphenated region.
1470        parse("s3+https://my-bucket.s3-us-west-2.amazonaws.com/repo").unwrap();
1471        // Path-style with region.
1472        parse("s3+https://s3.us-west-2.amazonaws.com/my-bucket/repo").unwrap();
1473        // Path-style without region (legacy global).
1474        parse("s3+https://s3.amazonaws.com/my-bucket/repo").unwrap();
1475        // Path-style legacy hyphenated region (`s3-<region>.amazonaws.com`).
1476        parse("s3+https://s3-us-east-1.amazonaws.com/my-bucket/repo").unwrap();
1477        // China partition (`.amazonaws.com.cn`): both addressing styles.
1478        parse("s3+https://my-bucket.s3.cn-north-1.amazonaws.com.cn/repo").unwrap();
1479        parse("s3+https://s3.cn-north-1.amazonaws.com.cn/my-bucket/repo").unwrap();
1480    }
1481
1482    #[test]
1483    fn rejects_china_amazonaws_host_missing_s3_service_marker() {
1484        // Same typo class as `rejects_amazonaws_host_missing_s3_service_marker`
1485        // but on the China partition (`.amazonaws.com.cn`). The typo
1486        // `<bucket>.<region>.amazonaws.com.cn` (no `.s3.` marker) must
1487        // produce the helpful `InvalidAwsS3Endpoint`, not a silent fall-
1488        // through to PathStyle and a DNS error at connect time.
1489        let err = parse("s3+https://git-test.cn-north-1.amazonaws.com.cn/repo").unwrap_err();
1490        assert!(
1491            matches!(err, ParseError::InvalidAwsS3Endpoint { ref host } if host == "git-test.cn-north-1.amazonaws.com.cn"),
1492            "expected InvalidAwsS3Endpoint, got {err:?}",
1493        );
1494    }
1495
1496    #[test]
1497    fn check_aws_s3_host_runs_before_addressing_override() {
1498        // Policy: `?addressing=path` (or `=virtual`) on an AWS hostname
1499        // does NOT bypass the validator. AWS owns `.amazonaws.com[.cn]`,
1500        // so any host on those suffixes that is not a recognised S3
1501        // endpoint is a typo. A user who needs path-style addressing on a
1502        // vanity host should use a domain they own, not `.amazonaws.com`.
1503        let err =
1504            parse("s3+https://corp.amazonaws.com/my-bucket/repo?addressing=path").unwrap_err();
1505        assert!(
1506            matches!(err, ParseError::InvalidAwsS3Endpoint { ref host } if host == "corp.amazonaws.com"),
1507            "expected InvalidAwsS3Endpoint, got {err:?}",
1508        );
1509        let err =
1510            parse("s3+https://corp.amazonaws.com/my-bucket/repo?addressing=virtual").unwrap_err();
1511        assert!(
1512            matches!(err, ParseError::InvalidAwsS3Endpoint { ref host } if host == "corp.amazonaws.com"),
1513            "expected InvalidAwsS3Endpoint, got {err:?}",
1514        );
1515    }
1516
1517    #[test]
1518    fn accepts_s3_prefix_known_false_negative() {
1519        // `s3-<non-region>.amazonaws.com` passes `check_aws_s3_host` because
1520        // the `starts_with("s3-")` guard does not validate the region name.
1521        // Pinned here to document the known false-negative: the parse
1522        // succeeds, but the user will see a DNS error at connect time rather
1523        // than the helpful `InvalidAwsS3Endpoint` message. The valid legacy
1524        // form (`s3-us-east-1`) and this false-negative are accepted by the
1525        // same branch; a tightening that rejects false-negatives must not
1526        // break valid legacy inputs.
1527        parse("s3+https://s3-mybucket.amazonaws.com/my-bucket/repo").unwrap();
1528    }
1529
1530    #[test]
1531    fn accepts_non_aws_s3_compatible_hosts() {
1532        // MinIO, Cloudflare R2, and other S3-compatible services that do
1533        // not use `.amazonaws.com` are not subject to the service-marker check.
1534        parse("s3+https://play.min.io/my-bucket/repo").unwrap();
1535        parse("s3+https://acc.r2.cloudflarestorage.com/my-bucket/repo").unwrap();
1536        parse("s3+https://localhost/my-bucket/repo?zip=0").unwrap();
1537    }
1538
1539    // --- Boolean-value vocabulary (issue #187) ----------------------------
1540    //
1541    // The same `parse_bool_value` helper governs both URL query flags
1542    // (`?zip=`, `?bundle_uri=`) and env-var booleans (`ALLOW_HTTP`).
1543    // The matrix below pins the accepted set so the two surfaces stay
1544    // in sync.
1545
1546    #[test]
1547    fn parse_bool_value_accepts_truthy_tokens() {
1548        for v in ["1", "true", "yes", "on"] {
1549            assert_eq!(parse_bool_value(v), Some(true), "expected true for `{v}`");
1550        }
1551    }
1552
1553    #[test]
1554    fn parse_bool_value_accepts_falsy_tokens() {
1555        for v in ["0", "false", "no", "off"] {
1556            assert_eq!(parse_bool_value(v), Some(false), "expected false for `{v}`");
1557        }
1558    }
1559
1560    #[test]
1561    fn parse_bool_value_is_case_insensitive() {
1562        // Per-value matrix covering common mixed-case spellings users
1563        // type ad-hoc. The helper must accept every casing for every
1564        // accepted token; this loop checks the full cross product.
1565        for (input, expected) in [
1566            ("TRUE", true),
1567            ("True", true),
1568            ("tRuE", true),
1569            ("YES", true),
1570            ("Yes", true),
1571            ("ON", true),
1572            ("On", true),
1573            ("FALSE", false),
1574            ("False", false),
1575            ("NO", false),
1576            ("No", false),
1577            ("OFF", false),
1578            ("Off", false),
1579        ] {
1580            assert_eq!(
1581                parse_bool_value(input),
1582                Some(expected),
1583                "expected {expected} for `{input}`",
1584            );
1585        }
1586    }
1587
1588    #[test]
1589    fn parse_bool_value_rejects_unknown_tokens() {
1590        // Empty string, near-misses, common typos, and arbitrary
1591        // junk must all fall through to `None` so the URL-flag path
1592        // can surface `InvalidFlagValue` and the env-var path can
1593        // fall back to "unset". Picking "y"/"n" as rejected pins the
1594        // policy: short forms are NOT accepted (issue #187 left this
1595        // explicit to avoid surprising aliases).
1596        for v in [
1597            "", " ", "yep", "nope", "2", "-1", "truee", "y", "n", "enabled",
1598        ] {
1599            assert_eq!(parse_bool_value(v), None, "expected None for `{v}`");
1600        }
1601    }
1602
1603    #[test]
1604    fn parse_bool_flag_propagates_invalid_flag_value_error() {
1605        // Names propagated into the error must match the flag the
1606        // user typed so the diagnostic stays useful.
1607        let err = parse_bool_flag("zip", "maybe").unwrap_err();
1608        assert!(
1609            matches!(&err, ParseError::InvalidFlagValue { name, value }
1610                if name == "zip" && value == "maybe"),
1611            "expected InvalidFlagValue(zip, maybe), got {err:?}",
1612        );
1613    }
1614
1615    #[test]
1616    fn url_bool_flags_accept_mixed_case_and_extended_vocabulary() {
1617        // Per-value coverage at the URL surface: `?zip=` and
1618        // `?bundle_uri=` must accept every truthy / falsy token the
1619        // helper recognises. Loopback host keeps this independent of
1620        // the AWS-endpoint validator.
1621        for v in ["1", "true", "True", "TRUE", "yes", "Yes", "on", "ON"] {
1622            let url = parse(&format!("s3+https://localhost/my-bucket/repo?zip={v}")).unwrap();
1623            assert!(url.flags().zip, "expected zip=true for `{v}`");
1624        }
1625        for v in ["0", "false", "False", "FALSE", "no", "No", "off", "OFF"] {
1626            let url = parse(&format!("s3+https://localhost/my-bucket/repo?zip={v}")).unwrap();
1627            assert!(!url.flags().zip, "expected zip=false for `{v}`");
1628        }
1629    }
1630
1631    #[test]
1632    fn url_bool_flags_reject_unknown_value_with_flag_name() {
1633        let err = parse("s3+https://localhost/my-bucket/repo?zip=maybe").unwrap_err();
1634        assert!(
1635            matches!(&err, ParseError::InvalidFlagValue { name, value }
1636                if name == "zip" && value == "maybe"),
1637            "expected InvalidFlagValue(zip, maybe), got {err:?}",
1638        );
1639
1640        let err = parse("s3+https://localhost/my-bucket/repo?bundle_uri=2").unwrap_err();
1641        assert!(
1642            matches!(&err, ParseError::InvalidFlagValue { name, value }
1643                if name == "bundle_uri" && value == "2"),
1644            "expected InvalidFlagValue(bundle_uri, 2), got {err:?}",
1645        );
1646    }
1647}