Skip to main content

zlayer_secrets/
worker_ca.rs

1//! Worker-CA support for `ZLayer`'s worker-tier mTLS.
2//!
3//! When a cluster operates in `worker-tier` mode, control-plane nodes maintain
4//! an X.509 Certificate Authority used to issue short-lived mTLS leaf certs to
5//! worker nodes during their `Register` RPC. Workers generate an EC P-256
6//! keypair locally, build a PKCS#10 CSR, and submit it; the leader verifies
7//! the bootstrap token, signs the CSR with this CA, and returns the cert +
8//! the CA chain.
9//!
10//! Workers then mutually-authenticate every subsequent gRPC connection using
11//! the leaf cert; the control plane validates against the same CA's root.
12//!
13//! # On-disk layout
14//!
15//! Two files under the cluster's data directory:
16//!
17//! - `<base_dir>/worker_ca.crt` — CA cert in PEM (mode 0644, world-readable
18//!   because it's the root of the public chain).
19//! - `<base_dir>/worker_ca.key` — CA private key in PEM PKCS#8 (mode 0600).
20//!
21//! Generation is one-shot: if `worker_ca.crt`/`worker_ca.key` already exist on
22//! disk, they're loaded; otherwise a fresh P-256 self-signed CA is generated
23//! and persisted atomically (`*.tmp` → `rename`).
24
25use std::path::{Path, PathBuf};
26
27use rcgen::{
28    BasicConstraints, Certificate, CertificateParams, CertificateSigningRequestParams,
29    DistinguishedName, DnType, ExtendedKeyUsagePurpose, IsCa, KeyPair, KeyUsagePurpose,
30    PKCS_ECDSA_P256_SHA256,
31};
32use rustls_pki_types::CertificateSigningRequestDer;
33use time::{Duration, OffsetDateTime};
34use tracing::{debug, info};
35
36use crate::{Result, SecretsError};
37
/// Name of the PEM-encoded CA certificate file inside the CA base directory.
///
/// Written with mode 0644 when a new CA is generated.
pub const WORKER_CA_CERT_FILE: &str = "worker_ca.crt";
/// Name of the PEM-encoded (PKCS#8) CA private-key file inside the CA base
/// directory.
///
/// Written with mode 0600 when a new CA is generated.
pub const WORKER_CA_KEY_FILE: &str = "worker_ca.key";

/// Suggested lifetime, in days, for leaf certificates issued to workers.
///
/// Workers have to re-register before a leaf expires, so rotation should be
/// scheduled comfortably ahead of this deadline.
pub const DEFAULT_LEAF_VALIDITY_DAYS: i64 = 90;

/// Lifetime, in years, of a freshly generated CA certificate.
///
/// Applied as `years * 365` days at generation time. Rotating the CA is a
/// separate (manual, future) operation.
pub const DEFAULT_CA_VALIDITY_YEARS: i64 = 10;
50
51/// Worker certificate authority.
52///
53/// Holds the CA keypair + cert in memory after load/generate. Persisted to
54/// disk so the CA identity survives daemon restarts.
55pub struct WorkerCa {
56    cert: Certificate,
57    key_pair: KeyPair,
58    base_dir: PathBuf,
59}
60
61impl std::fmt::Debug for WorkerCa {
62    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
63        f.debug_struct("WorkerCa")
64            .field("base_dir", &self.base_dir)
65            .field("subject", &self.cert.params().distinguished_name)
66            .finish_non_exhaustive()
67    }
68}
69
70impl WorkerCa {
71    /// Load the worker CA from `base_dir`, generating one if absent.
72    ///
73    /// # Errors
74    ///
75    /// Returns [`SecretsError::Storage`] on I/O failure and
76    /// [`SecretsError::Encryption`] on malformed on-disk PEM or `rcgen`
77    /// errors.
78    pub fn load_or_generate(base_dir: impl AsRef<Path>) -> Result<Self> {
79        let base_dir = base_dir.as_ref().to_path_buf();
80        std::fs::create_dir_all(&base_dir)
81            .map_err(|e| SecretsError::Storage(format!("create worker CA dir: {e}")))?;
82
83        let cert_path = base_dir.join(WORKER_CA_CERT_FILE);
84        let key_path = base_dir.join(WORKER_CA_KEY_FILE);
85
86        if cert_path.exists() && key_path.exists() {
87            return Self::load_from_pem(&cert_path, &key_path, base_dir);
88        }
89
90        Self::generate_and_persist(base_dir)
91    }
92
93    fn load_from_pem(cert_path: &Path, key_path: &Path, base_dir: PathBuf) -> Result<Self> {
94        let cert_pem = std::fs::read_to_string(cert_path).map_err(|e| {
95            SecretsError::Storage(format!("read worker CA cert {}: {e}", cert_path.display()))
96        })?;
97        let key_pem = std::fs::read_to_string(key_path).map_err(|e| {
98            SecretsError::Storage(format!("read worker CA key {}: {e}", key_path.display()))
99        })?;
100
101        let key_pair = KeyPair::from_pem(&key_pem)
102            .map_err(|e| SecretsError::Encryption(format!("parse worker CA key PEM: {e}")))?;
103
104        let params = CertificateParams::from_ca_cert_pem(&cert_pem)
105            .map_err(|e| SecretsError::Encryption(format!("parse worker CA cert PEM: {e}")))?;
106        let cert = params
107            .self_signed(&key_pair)
108            .map_err(|e| SecretsError::Encryption(format!("re-bind CA cert: {e}")))?;
109
110        debug!("Loaded existing worker CA from {}", base_dir.display());
111        Ok(Self {
112            cert,
113            key_pair,
114            base_dir,
115        })
116    }
117
118    fn generate_and_persist(base_dir: PathBuf) -> Result<Self> {
119        let mut params = CertificateParams::default();
120        let mut dn = DistinguishedName::new();
121        dn.push(DnType::CommonName, "ZLayer Worker CA");
122        dn.push(DnType::OrganizationName, "ZLayer");
123        params.distinguished_name = dn;
124        params.is_ca = IsCa::Ca(BasicConstraints::Unconstrained);
125        params.key_usages = vec![
126            KeyUsagePurpose::KeyCertSign,
127            KeyUsagePurpose::CrlSign,
128            KeyUsagePurpose::DigitalSignature,
129        ];
130
131        let now = OffsetDateTime::now_utc();
132        params.not_before = now - Duration::minutes(1);
133        params.not_after = now + Duration::days(DEFAULT_CA_VALIDITY_YEARS * 365);
134
135        let key_pair = KeyPair::generate_for(&PKCS_ECDSA_P256_SHA256)
136            .map_err(|e| SecretsError::Encryption(format!("generate worker CA keypair: {e}")))?;
137        let cert = params
138            .self_signed(&key_pair)
139            .map_err(|e| SecretsError::Encryption(format!("self-sign worker CA cert: {e}")))?;
140
141        let cert_pem = cert.pem();
142        let key_pem = key_pair.serialize_pem();
143
144        let cert_path = base_dir.join(WORKER_CA_CERT_FILE);
145        let key_path = base_dir.join(WORKER_CA_KEY_FILE);
146
147        atomic_write(&cert_path, cert_pem.as_bytes(), 0o644)?;
148        atomic_write(&key_path, key_pem.as_bytes(), 0o600)?;
149
150        info!(
151            "Generated new worker CA at {} (valid {} years)",
152            base_dir.display(),
153            DEFAULT_CA_VALIDITY_YEARS
154        );
155
156        Ok(Self {
157            cert,
158            key_pair,
159            base_dir,
160        })
161    }
162
163    /// Return the CA certificate in DER encoding for inclusion in the gRPC
164    /// `RegisterResponse.ca_chain_der`.
165    #[must_use]
166    pub fn ca_cert_der(&self) -> Vec<u8> {
167        self.cert.der().to_vec()
168    }
169
170    /// Return the CA certificate in PEM (for human readers / debug).
171    #[must_use]
172    pub fn ca_cert_pem(&self) -> String {
173        self.cert.pem()
174    }
175
176    /// Sign a worker-submitted CSR. Returns the leaf cert in DER.
177    ///
178    /// # Errors
179    ///
180    /// Returns [`SecretsError::Encryption`] if the CSR is malformed, uses an
181    /// unsupported key type, or signing fails.
182    pub fn sign_csr_der(
183        &self,
184        csr_der: &[u8],
185        common_name: &str,
186        validity: Duration,
187    ) -> Result<Vec<u8>> {
188        // Convert raw DER bytes to the typed `CertificateSigningRequestDer`
189        // that rcgen 0.13 expects. The borrowed-slice `From` impl avoids
190        // copying the bytes.
191        let csr_typed = CertificateSigningRequestDer::from(csr_der);
192
193        let mut params = CertificateSigningRequestParams::from_der(&csr_typed)
194            .map_err(|e| SecretsError::Encryption(format!("parse CSR: {e}")))?;
195
196        // Override the subject CN to a leader-controlled value so a malicious
197        // worker can't pick its own identity. Everything else (key, SANs) comes
198        // from the CSR as submitted.
199        let mut dn = DistinguishedName::new();
200        dn.push(DnType::CommonName, common_name);
201        dn.push(DnType::OrganizationName, "ZLayer Worker");
202        params.params.distinguished_name = dn;
203
204        let now = OffsetDateTime::now_utc();
205        params.params.not_before = now - Duration::minutes(1);
206        params.params.not_after = now + validity;
207        params.params.key_usages = vec![
208            KeyUsagePurpose::DigitalSignature,
209            KeyUsagePurpose::KeyEncipherment,
210        ];
211        params.params.extended_key_usages = vec![
212            ExtendedKeyUsagePurpose::ClientAuth,
213            ExtendedKeyUsagePurpose::ServerAuth,
214        ];
215
216        let cert = params
217            .signed_by(&self.cert, &self.key_pair)
218            .map_err(|e| SecretsError::Encryption(format!("sign CSR: {e}")))?;
219
220        Ok(cert.der().to_vec())
221    }
222
223    /// Base directory where the CA files live.
224    #[must_use]
225    pub fn base_dir(&self) -> &Path {
226        &self.base_dir
227    }
228}
229
230fn atomic_write(path: &Path, data: &[u8], mode: u32) -> Result<()> {
231    let tmp = path.with_extension(format!(
232        "{}.tmp",
233        path.extension().and_then(|s| s.to_str()).unwrap_or("")
234    ));
235    std::fs::write(&tmp, data)
236        .map_err(|e| SecretsError::Storage(format!("write tmp {}: {e}", tmp.display())))?;
237    #[cfg(unix)]
238    {
239        use std::os::unix::fs::PermissionsExt;
240        let perms = std::fs::Permissions::from_mode(mode);
241        std::fs::set_permissions(&tmp, perms)
242            .map_err(|e| SecretsError::Storage(format!("chmod {}: {e}", tmp.display())))?;
243    }
244    #[cfg(not(unix))]
245    {
246        let _ = mode;
247    }
248    std::fs::rename(&tmp, path).map_err(|e| {
249        SecretsError::Storage(format!(
250            "rename {} -> {}: {e}",
251            tmp.display(),
252            path.display()
253        ))
254    })?;
255    Ok(())
256}
257
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Generating a CA, dropping it, and loading again from the same
    /// directory must give back the same CA identity.
    #[test]
    fn load_or_generate_persists_and_reloads() {
        let workdir = TempDir::new().expect("tempdir");

        // First call generates and persists; the handle goes out of scope
        // before the second call reads the files back.
        let generated_der = {
            let generated = WorkerCa::load_or_generate(workdir.path()).expect("generate");
            generated.ca_cert_der()
        };

        let reloaded = WorkerCa::load_or_generate(workdir.path()).expect("reload");
        let reloaded_der = reloaded.ca_cert_der();

        // The reloaded certificate is re-signed, and ECDSA signatures are
        // non-deterministic, so the raw DER is not compared. The public key
        // and the subject are what must survive the round trip.
        let (_, before) =
            x509_parser::parse_x509_certificate(&generated_der).expect("parse cert1");
        let (_, after) = x509_parser::parse_x509_certificate(&reloaded_der).expect("parse cert2");

        let key_before = &before.tbs_certificate.subject_pki.subject_public_key.data;
        let key_after = &after.tbs_certificate.subject_pki.subject_public_key.data;
        assert_eq!(key_before, key_after, "reload must yield same CA public key");

        let subject_before = before.tbs_certificate.subject.to_string();
        let subject_after = after.tbs_certificate.subject.to_string();
        assert_eq!(
            subject_before, subject_after,
            "reload must yield same CA subject"
        );
    }

    /// A CSR built by a worker is signed by the CA, and the issued leaf has
    /// the CA as issuer and the leader-chosen common name as subject.
    #[test]
    fn sign_csr_round_trip() {
        let workdir = TempDir::new().expect("tempdir");
        let ca = WorkerCa::load_or_generate(workdir.path()).expect("ca");

        // Worker side: P-256 keypair plus a CSR whose CN the leader ignores.
        let worker_key = KeyPair::generate_for(&PKCS_ECDSA_P256_SHA256).expect("kp");
        let mut requested_dn = DistinguishedName::new();
        requested_dn.push(DnType::CommonName, "doesnt-matter-leader-overrides");
        let mut request_params = CertificateParams::default();
        request_params.distinguished_name = requested_dn;
        let request = request_params
            .serialize_request(&worker_key)
            .expect("serialize CSR");
        let request_der = request.der().to_vec();

        // Leader side: sign with a leader-chosen identity.
        let leaf_der = ca
            .sign_csr_der(&request_der, "worker-7", Duration::days(7))
            .expect("sign");
        assert!(!leaf_der.is_empty());

        // Parse the leaf back and check who issued it and who it names.
        let (_, leaf) = x509_parser::parse_x509_certificate(&leaf_der).expect("parse leaf");

        let issuer_cn = leaf
            .issuer()
            .iter_common_name()
            .next()
            .map_or("", |cn| cn.as_str().unwrap_or(""));
        assert_eq!(issuer_cn, "ZLayer Worker CA");

        let subject_cn = leaf
            .subject()
            .iter_common_name()
            .next()
            .map_or("", |cn| cn.as_str().unwrap_or(""));
        assert_eq!(subject_cn, "worker-7");
    }
}
327            .and_then(|cn| cn.as_str().ok())
328            .unwrap_or("");
329        assert_eq!(subject_cn, "worker-7");
330    }
331}