Skip to main content

zlayer_agent/
auth.rs

//! Per-container credential minting.
//!
//! Every container can be handed a JWT so that its workload can call the daemon
//! API back without external credentials. Historically that token was an
//! unscoped, 365-day, cluster-wide reader (`roles: ["container"]`) AND the host
//! admin Unix socket was bind-mounted into every container — so a container
//! was, in effect, daemon-admin. Both holes are closed here:
//!
//! - The token is now a **scoped** access token, bounded by a short TTL and
//!   carrying no privileged role. By default it holds `deployment:<own>:read`
//!   plus `container:*:write` and `image:*:write` (see [`default_scopes_for`]).
//! - The admin socket is **not** mounted by default; a deployment must opt in
//!   explicitly via the [`LABEL_DAEMON_SOCKET`] label (documented as full-admin).
//!
//! All three knobs (scopes, TTL, admin socket) are overridable per-service via
//! labels, so a CI-runner service can request broader scopes / a longer TTL /
//! the socket when it genuinely needs them.
17
18use std::collections::HashMap;
19use std::time::{Duration, SystemTime, UNIX_EPOCH};
20
21use async_trait::async_trait;
22use jsonwebtoken::{encode, EncodingKey, Header};
23use zlayer_types::jwt::Claims;
24use zlayer_types::storage::{PermissionLevel, StoredAccessToken, TokenScope};
25
26/// Sink the runtime uses to persist + revoke per-container scoped access
27/// tokens. Implemented in the bin over the daemon's `AccessTokenStorage`
28/// (which is SecretsStore-backed, so it replicates cluster-wide).
29#[async_trait]
30pub trait ContainerTokenSink: Send + Sync + std::fmt::Debug {
31    /// Persist a freshly-minted container token record (so its `jti` is
32    /// accepted by the auth layer's fail-closed revocation check). Returns
33    /// `true` when the record was persisted (the token is now revocable and its
34    /// `jti` may safely be embedded); `false` on failure (the caller must mint
35    /// without a `jti` so the fail-closed check doesn't reject it).
36    async fn persist(&self, record: StoredAccessToken) -> bool;
37    /// Revoke a container token by its `jti` (called on teardown).
38    async fn revoke(&self, jti: &str);
39}
40
41/// Spawns and tears down a per-container Docker Engine API socket.
42///
43/// Implemented in the `zlayer` bin (which can see both `zlayer-agent` and the
44/// higher-level `zlayer-docker` crate — the agent itself must NOT depend on
45/// `zlayer-docker`, that would be a dependency cycle). The runtime calls
46/// [`spawn`](Self::spawn) when a container opts into `zlayer.io/docker-socket`
47/// and [`teardown`](Self::teardown) on stop/remove.
48#[async_trait]
49pub trait DockerSocketSpawner: Send + Sync + std::fmt::Debug {
50    /// Provision a per-container Docker Engine API socket authenticated as the
51    /// container's scoped `token`, and return the HOST path of the socket to
52    /// bind-mount into the container at `/var/run/docker.sock`. The path is
53    /// derived deterministically from `container_key`, so the server task may
54    /// still be binding asynchronously when this returns (it connects to the
55    /// daemon first, then binds — near-instant since the daemon is already up).
56    /// Returns `None` on failure (the container still starts, just without a
57    /// docker socket). Idempotent per `container_key`.
58    async fn spawn(&self, container_key: &str, token: String) -> Option<String>;
59    /// Abort the per-container socket server task and unlink its socket file.
60    /// Safe to call for an unknown key (no-op).
61    async fn teardown(&self, container_key: &str);
62}
63
64/// Sink the runtime uses to persist a service's most-recently-resolved image
65/// digest into the deployment store, so a daemon restart can recreate the
66/// service from the already-local image (by digest) with zero remote/S3 traffic.
67///
68/// Implemented in the `zlayer` bin over the daemon's `DeploymentStorage`
69/// (`SqlxStorage` in `ZLayer`, the ZQL store in `ZLayerZQL`). The agent itself
70/// must NOT depend on the storage layer — that would be a dependency cycle — so
71/// the concrete write lives in the bin and is threaded in as this trait object.
72///
73/// `record` is best-effort: a failed store update is logged by the implementor
74/// and MUST NOT fail the deploy/scale that triggered the pull. Breaking the
75/// boot-time circular dependency (`ZLayer` using its own `ZataStorage` S3
76/// backend as the blob cache) is the whole point — see
77/// [`crate::service::ServiceInstance::set_restore_pin`].
78#[async_trait]
79pub trait DeploymentDigestSink: Send + Sync + std::fmt::Debug {
80    /// Persist `digest` (e.g. `"sha256:abc…"`) as the resolved image digest for
81    /// `service` within `deployment`, keyed by service name in
82    /// `StoredDeployment.resolved_image_digests`. Best-effort; never panics.
83    async fn record(&self, deployment: &str, service: &str, digest: &str);
84}
85
/// Opt-in label that bind-mounts the host daemon Unix socket into a service's
/// containers (Docker-in-Docker / container-management workloads).
///
/// **Setting it grants the container full daemon admin** via the socket
/// auto-auth path, which is why it is off by default and must be enabled
/// deliberately.
pub const LABEL_DAEMON_SOCKET: &str = "zlayer.io/daemon-socket";

/// Label that overrides the container token's TTL, expressed in seconds.
/// When absent, [`DEFAULT_CONTAINER_TOKEN_TTL_SECS`] applies.
pub const LABEL_API_TOKEN_TTL: &str = "zlayer.io/api-token-ttl";

/// Label that overrides the container token's scopes.
///
/// The value is a comma-separated list of `kind:id:level` entries (or
/// `kind:level` for a wildcard id); an id of `*` also means wildcard. The
/// parsed list replaces the whole default scope set (see
/// [`default_scopes_for`]) rather than extending it.
pub const LABEL_API_SCOPES: &str = "zlayer.io/api-scopes";

/// Default lifetime of a container token: 24 hours.
///
/// Short compared with the old 365-day token — it bounds the window of a
/// leaked token — while still comfortably covering a container's startup-time
/// API calls. Long-running workloads that call the API later than this can
/// raise the value through [`LABEL_API_TOKEN_TTL`].
pub const DEFAULT_CONTAINER_TOKEN_TTL_SECS: u64 = 24 * 3_600;
106
107/// The resolved API-access policy for a container, derived from its deployment
108/// + service labels.
109#[derive(Debug, Clone)]
110pub struct ContainerApiAccess {
111    /// Scopes baked into the minted token.
112    pub scopes: Vec<TokenScope>,
113    /// Token lifetime.
114    pub ttl: Duration,
115    /// Whether to bind-mount the host admin Unix socket (full-admin opt-in).
116    pub mount_socket: bool,
117}
118
119/// Parse a single `kind:id:level` / `kind:level` scope string into a
120/// [`TokenScope`]. An `id` of `*` (or the 2-segment form) is a wildcard. Returns
121/// `None` for a malformed scope (caller logs + skips).
122fn parse_scope(s: &str) -> Option<TokenScope> {
123    let parts: Vec<&str> = s.split(':').collect();
124    let (kind, id, level) = match parts.as_slice() {
125        [kind, level] => (*kind, None, *level),
126        [kind, id, level] => (
127            *kind,
128            if *id == "*" || id.is_empty() {
129                None
130            } else {
131                Some((*id).to_string())
132            },
133            *level,
134        ),
135        _ => return None,
136    };
137    if kind.is_empty() {
138        return None;
139    }
140    let level = match level.to_ascii_lowercase().as_str() {
141        "none" => PermissionLevel::None,
142        "read" => PermissionLevel::Read,
143        "execute" => PermissionLevel::Execute,
144        "write" => PermissionLevel::Write,
145        _ => return None,
146    };
147    Some(TokenScope::new(kind, id, level))
148}
149
150/// Resolve a container's API-access policy from its deployment name and service
151/// labels. Default: read + `container:*:write` + `image:*:write` on its own
152/// deployment (so the default per-container Docker socket can drive
153/// build/run/push), [`DEFAULT_CONTAINER_TOKEN_TTL_SECS`] TTL, no admin socket.
154/// Labels override each knob.
155#[must_use]
156pub fn resolve_container_api_access<S: std::hash::BuildHasher>(
157    deployment: &str,
158    labels: &HashMap<String, String, S>,
159) -> ContainerApiAccess {
160    let scopes = match labels.get(LABEL_API_SCOPES) {
161        Some(raw) if !raw.trim().is_empty() => {
162            let parsed: Vec<TokenScope> = raw
163                .split(',')
164                .filter_map(|s| parse_scope(s.trim()))
165                .collect();
166            if parsed.is_empty() {
167                default_scopes_for(deployment)
168            } else {
169                parsed
170            }
171        }
172        _ => default_scopes_for(deployment),
173    };
174
175    let ttl = labels
176        .get(LABEL_API_TOKEN_TTL)
177        .and_then(|v| v.trim().parse::<u64>().ok())
178        .filter(|secs| *secs > 0)
179        .unwrap_or(DEFAULT_CONTAINER_TOKEN_TTL_SECS);
180
181    let mount_socket = labels
182        .get(LABEL_DAEMON_SOCKET)
183        .is_some_and(|v| matches!(v.trim().to_ascii_lowercase().as_str(), "1" | "true" | "yes"));
184
185    ContainerApiAccess {
186        scopes,
187        ttl: Duration::from_secs(ttl),
188        mount_socket,
189    }
190}
191
192/// Default scopes for a container. Every container gets a per-container Docker
193/// Engine API socket by default, and `docker` needs to create containers and
194/// pull/build/push images, so grant `container:*:write` + `image:*:write` (the
195/// kinds the Docker-compat endpoints authorize against) in addition to the
196/// read-only own-deployment default. A deployment can replace these wholesale
197/// via [`LABEL_API_SCOPES`].
198#[must_use]
199pub fn default_scopes_for(deployment: &str) -> Vec<TokenScope> {
200    let mut scopes = default_container_scopes(deployment);
201    scopes.push(TokenScope::new("container", None, PermissionLevel::Write));
202    scopes.push(TokenScope::new("image", None, PermissionLevel::Write));
203    scopes
204}
205
206/// The default least-privilege container scope: read-only on its own deployment.
207#[must_use]
208pub fn default_container_scopes(deployment: &str) -> Vec<TokenScope> {
209    vec![TokenScope::new(
210        "deployment",
211        Some(deployment.to_string()),
212        PermissionLevel::Read,
213    )]
214}
215
216/// Mint a scoped JWT for a container.
217///
218/// The token carries `scopes` (and the marker role `container`, which grants
219/// nothing on its own — authority comes from the scopes) and the supplied
220/// `jti`. When `jti` is `Some`, a matching [`StoredAccessToken`] record MUST
221/// already be persisted (the auth layer is fail-closed and rejects a `jti` with
222/// no record); pass `None` to mint an un-revocable token bounded only by the
223/// short TTL. It is signed with the daemon's JWT secret so the API accepts it.
224///
225/// # Errors
226///
227/// Returns an error string if the system clock is unavailable or JWT encoding fails.
228pub fn mint_container_token(
229    secret: &str,
230    service_name: &str,
231    container_id: &str,
232    scopes: Vec<TokenScope>,
233    ttl: Duration,
234    jti: Option<String>,
235) -> Result<String, String> {
236    let now = SystemTime::now()
237        .duration_since(UNIX_EPOCH)
238        .map_err(|e| e.to_string())?;
239    let claims = Claims {
240        sub: format!("container:{service_name}:{container_id}"),
241        iat: now.as_secs(),
242        exp: (now + ttl).as_secs(),
243        iss: "zlayer".to_string(),
244        roles: vec!["container".to_string()],
245        email: None,
246        node_id: None,
247        scopes,
248        jti,
249    };
250    encode(
251        &Header::default(),
252        &claims,
253        &EncodingKey::from_secret(secret.as_bytes()),
254    )
255    .map_err(|e| e.to_string())
256}
257
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an owned label map from borrowed key/value pairs.
    fn label_map(pairs: &[(&str, &str)]) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|&(key, value)| (key.to_string(), value.to_string()))
            .collect()
    }

    /// The default token lifetime as a `Duration`.
    fn default_ttl() -> Duration {
        Duration::from_secs(DEFAULT_CONTAINER_TOKEN_TTL_SECS)
    }

    #[test]
    fn default_access_includes_docker_scopes() {
        let access = resolve_container_api_access("myapp", &label_map(&[]));
        let grants = |kind: &str, id: Option<&str>, level: PermissionLevel| {
            access.scopes.iter().any(|scope| {
                scope.resource_kind == kind
                    && scope.resource_id.as_deref() == id
                    && scope.level == level
            })
        };

        // Read-only own-deployment base.
        assert!(grants("deployment", Some("myapp"), PermissionLevel::Read));
        // The default per-container Docker socket needs wildcard container +
        // image write.
        assert!(grants("container", None, PermissionLevel::Write));
        assert!(grants("image", None, PermissionLevel::Write));

        assert_eq!(access.ttl, default_ttl());
        assert!(!access.mount_socket);
    }

    #[test]
    fn explicit_scopes_replace_defaults() {
        let labels = label_map(&[(LABEL_API_SCOPES, "deployment:foo:read")]);
        let access = resolve_container_api_access("foo", &labels);

        // Explicit scopes replace the defaults wholesale — no widening.
        assert_eq!(access.scopes.len(), 1);
        assert_eq!(access.scopes[0].resource_kind, "deployment");
    }

    #[test]
    fn labels_override_scopes_ttl_and_socket() {
        let labels = label_map(&[
            (
                LABEL_API_SCOPES,
                "deployment:foo:write, job:build:execute, container:*:read",
            ),
            (LABEL_API_TOKEN_TTL, "3600"),
            (LABEL_DAEMON_SOCKET, "true"),
        ]);
        let access = resolve_container_api_access("foo", &labels);

        let scopes = &access.scopes;
        assert_eq!(scopes.len(), 3);
        assert_eq!(scopes[0].level, PermissionLevel::Write);
        assert_eq!(scopes[1].resource_kind, "job");
        assert_eq!(scopes[1].resource_id.as_deref(), Some("build"));
        // `*` as the id means wildcard.
        assert!(scopes[2].resource_id.is_none());

        assert_eq!(access.ttl, Duration::from_secs(3600));
        assert!(access.mount_socket);
    }

    #[test]
    fn malformed_scopes_fall_back_to_default() {
        let labels = label_map(&[(LABEL_API_SCOPES, "garbage::::")]);
        let access = resolve_container_api_access("d", &labels);

        assert_eq!(access.scopes, default_scopes_for("d"));
    }

    #[test]
    fn invalid_ttl_falls_back_to_default() {
        let labels = label_map(&[(LABEL_API_TOKEN_TTL, "notanumber")]);
        let access = resolve_container_api_access("d", &labels);

        assert_eq!(access.ttl, default_ttl());
    }
}
333}