Skip to main content

koi_certmesh/
core_renewal.rs

1//! Member renewal, trust-bundle pull, health, role, and promotion.
2//!
3//! Part of the inherent impl CertmeshCore, split from lib.rs (certmesh M2).
4//! As a child module of the crate root, 'use super::*' inherits lib.rs's
5//! imports, sibling modules, and crate-private state/helpers as in the original.
6use super::*;
7
8impl CertmeshCore {
9    // ── Phase 3 - Lifecycle ────────────────────────────────────────
10
11    /// Member-initiated, rotate-key renewal (ADR-017 F6).
12    ///
13    /// A no-op ([`RenewOutcome::NotApplicable`]) unless this node has a persisted
14    /// [`member::MemberState`] (i.e. it *joined* a mesh). When its local leaf is
15    /// within the CA policy's `renew_threshold_days`, it:
16    ///
17    /// 1. generates a **fresh** keypair + CSR (rotate-on-renewal — the new private
18    ///    key is held in memory until the install succeeds, never on the CA),
19    /// 2. POSTs only the CSR to the CA's mTLS `/v1/certmesh/renew`, presenting its
20    ///    **current** (still-valid) leaf as the client identity,
21    /// 3. verifies the returned CA fingerprint matches its pin (anti-CA-swap),
22    /// 4. installs the new key + signed leaf locally and runs its reload hook.
23    ///
24    /// The CA never generates or receives a member private key — on enroll *or*
25    /// renew. If the network call fails (CA down, cert lapsed past mTLS validity)
26    /// the local files are left untouched and the loop retries next tick.
27    ///
28    /// Emits `CertRenewed`, `CertRenewalFailed`, and `CertExpiringSoon` lifecycle events.
29    pub async fn renew_self_if_due(&self) -> Result<RenewOutcome, CertmeshError> {
30        // Inner function carries all the real work; this outer shell handles event
31        // emission for every failure exit without scattering it across every `?`.
32        let days_left_at_attempt = self.cert_days_left_if_member();
33        let result = self.renew_self_if_due_inner().await;
34        match &result {
35            Err(e) => {
36                let count = self
37                    .state
38                    .renewal_failure_count
39                    .fetch_add(1, std::sync::atomic::Ordering::Relaxed)
40                    + 1;
41                let _ = self.state.event_tx.send(CertmeshEvent::CertRenewalFailed {
42                    reason: e.to_string(),
43                    consecutive_failures: count,
44                });
45                // Only emit CertExpiringSoon when the cert is actually past the renewal
46                // threshold (i.e. we attempted renewal, not just "not due").
47                if let Some(days) = days_left_at_attempt {
48                    let _ = self
49                        .state
50                        .event_tx
51                        .send(CertmeshEvent::CertExpiringSoon { days_left: days });
52                }
53            }
54            Ok(RenewOutcome::Renewed { ref expires, .. }) => {
55                self.state
56                    .renewal_failure_count
57                    .store(0, std::sync::atomic::Ordering::Relaxed);
58                let expires_at = expires
59                    .parse::<chrono::DateTime<chrono::Utc>>()
60                    .unwrap_or_else(|_| chrono::Utc::now() + chrono::Duration::days(90));
61                let _ = self
62                    .state
63                    .event_tx
64                    .send(CertmeshEvent::CertRenewed { expires_at });
65            }
66            _ => {}
67        }
68        result
69    }
70
71    /// How many days until the local member cert expires. Returns `None` when the
72    /// node is not a member or the cert cannot be parsed. Used to populate
73    /// `CertExpiringSoon` without re-reading the cert inside the inner function.
74    fn cert_days_left_if_member(&self) -> Option<i64> {
75        let state = member::load(&self.state.paths.member_state_path())?;
76        let cert_path = self
77            .state
78            .paths
79            .certs_dir()
80            .join(&state.hostname)
81            .join("cert.pem");
82        let pem = std::fs::read_to_string(cert_path).ok()?;
83        let not_after = leaf_not_after_utc(&pem)?;
84        Some((not_after - chrono::Utc::now()).num_days())
85    }
86
87    async fn renew_self_if_due_inner(&self) -> Result<RenewOutcome, CertmeshError> {
88        let Some(state) = member::load(&self.state.paths.member_state_path()) else {
89            return Ok(RenewOutcome::NotApplicable);
90        };
91
92        let cert_dir = self.state.paths.certs_dir().join(&state.hostname);
93        // Read the current key + cert + pinned CA off the blocking pool.
94        let read_dir = cert_dir.clone();
95        let (current_cert, current_key, pinned_ca_pem) =
96            tokio::task::spawn_blocking(move || -> std::io::Result<(String, String, String)> {
97                Ok((
98                    std::fs::read_to_string(read_dir.join("cert.pem"))?,
99                    std::fs::read_to_string(read_dir.join("key.pem"))?,
100                    std::fs::read_to_string(read_dir.join("ca.pem"))?,
101                ))
102            })
103            .await
104            .map_err(|e| CertmeshError::Internal(format!("read member cert task: {e}")))??;
105
106        // Due? Compare the local leaf's not_after against the renew threshold.
107        let not_after = leaf_not_after_utc(&current_cert).ok_or_else(|| {
108            CertmeshError::Internal("cannot parse local leaf expiry for renewal".into())
109        })?;
110        let threshold = chrono::Duration::days(i64::from(state.policy.renew_threshold_days));
111        if chrono::Utc::now() + threshold < not_after {
112            return Ok(RenewOutcome::NotDue { not_after });
113        }
114
115        // Rotate: fresh keypair + CSR. The new key lives only in memory until the
116        // CA-signed leaf is in hand, so a failed renewal never discards the
117        // working key.
118        let (new_key_pem, csr_pem) = csr::generate_keypair_and_csr(&state.hostname, &state.sans)?;
119        let req_body = serde_json::to_string(&protocol::RenewRequest {
120            hostname: state.hostname.clone(),
121            csr: csr_pem,
122        })
123        .map_err(|e| CertmeshError::Internal(format!("serialize renew request: {e}")))?;
124
125        let (host, port) = state.ca_mtls_authority();
126        // Bound the network call: a black-holed CA must not stall the loop (or
127        // daemon shutdown) for the OS TCP timeout.
128        let (status, body) = tokio::time::timeout(
129            RENEWAL_REQUEST_TIMEOUT,
130            mtls::post_json(
131                &host,
132                port,
133                http::paths::RENEW,
134                &req_body,
135                &current_cert,
136                &current_key,
137                &pinned_ca_pem,
138            ),
139        )
140        .await
141        .map_err(|_| CertmeshError::RenewalFailed {
142            hostname: state.hostname.clone(),
143            reason: format!(
144                "renewal request to {host}:{port} timed out after {}s",
145                RENEWAL_REQUEST_TIMEOUT.as_secs()
146            ),
147        })??;
148
149        if status != 200 {
150            return Err(CertmeshError::RenewalFailed {
151                hostname: state.hostname.clone(),
152                reason: format!("CA returned HTTP {status}: {body}"),
153            });
154        }
155        let resp: protocol::RenewResponse =
156            serde_json::from_str(&body).map_err(|e| CertmeshError::RenewalFailed {
157                hostname: state.hostname.clone(),
158                reason: format!("malformed renew response: {e}"),
159            })?;
160
161        // Anti-CA-swap: derive the fingerprint from the RETURNED CA cert (the one
162        // we are about to install as our new pin) and require it to match the pin.
163        // Deriving locally — rather than trusting the asserted `ca_fingerprint`
164        // string — is the authoritative check.
165        let returned_ca_fp = pem::parse(&resp.ca_cert)
166            .map(|der| koi_crypto::pinning::fingerprint_sha256(der.contents()))
167            .map_err(|e| CertmeshError::RenewalFailed {
168                hostname: state.hostname.clone(),
169                reason: format!("returned ca_cert is not valid PEM: {e}"),
170            })?;
171        if !koi_crypto::pinning::fingerprints_match(&returned_ca_fp, &state.ca_fingerprint) {
172            return Err(CertmeshError::RenewalFailed {
173                hostname: state.hostname.clone(),
174                reason: "returned CA cert does not match the pinned CA fingerprint".into(),
175            });
176        }
177
178        // Install the new key + leaf atomically (temp → rename per file).
179        let new_cert = resp.service_cert.clone();
180        let new_ca = resp.ca_cert.clone();
181        let fullchain = format!("{new_cert}{new_ca}");
182        let dir = cert_dir.clone();
183        tokio::task::spawn_blocking(move || -> Result<(), CertmeshError> {
184            std::fs::create_dir_all(&dir)?;
185            write_file_atomic(&dir.join("key.pem"), new_key_pem.as_bytes(), true)?;
186            write_file_atomic(&dir.join("cert.pem"), new_cert.as_bytes(), false)?;
187            write_file_atomic(&dir.join("ca.pem"), new_ca.as_bytes(), false)?;
188            write_file_atomic(&dir.join("fullchain.pem"), fullchain.as_bytes(), false)?;
189            Ok(())
190        })
191        .await
192        .map_err(|e| CertmeshError::Internal(format!("write renewed cert task: {e}")))??;
193
194        tracing::info!(hostname = %state.hostname, expires = %resp.expires, "Member certificate renewed (rotated key)");
195
196        // Run the local reload hook, if configured.
197        let hook = state
198            .reload_hook
199            .as_deref()
200            .map(lifecycle::execute_reload_hook);
201
202        Ok(RenewOutcome::Renewed {
203            expires: resp.expires,
204            hook,
205        })
206    }
207
208    /// Pull, verify, and apply the CA's signed trust bundle (ADR-017 P1/F4).
209    ///
210    /// A no-op ([`BundleOutcome::NotApplicable`]) unless this node joined a mesh.
211    /// Fetches the self-verifying bundle over plain HTTP, verifies the ES256
212    /// signature against the **pinned** CA fingerprint, and rejects a strictly
213    /// older `seq` (anti-rollback). On a newer bundle it refreshes the member's
214    /// cached `policy` and `last_bundle_seq`, and flags whether this node has been
215    /// revoked mesh-wide.
216    pub async fn pull_trust_bundle(&self) -> Result<BundleOutcome, CertmeshError> {
217        let member_path = self.state.paths.member_state_path();
218        let Some(mut state) = member::load(&member_path) else {
219            return Ok(BundleOutcome::NotApplicable);
220        };
221
222        let (host, port) = (state.ca_host.clone(), state.ca_http_port);
223        let (status, body) = tokio::time::timeout(
224            RENEWAL_REQUEST_TIMEOUT,
225            mtls::get(&host, port, http::paths::TRUST_BUNDLE),
226        )
227        .await
228        .map_err(|_| {
229            CertmeshError::Internal(format!("trust-bundle pull from {host}:{port} timed out"))
230        })??;
231
232        if status != 200 {
233            return Err(CertmeshError::Internal(format!(
234                "CA returned HTTP {status} for trust-bundle"
235            )));
236        }
237        let signed: bundle::SignedBundle = serde_json::from_str(&body)
238            .map_err(|e| CertmeshError::Internal(format!("malformed trust bundle: {e}")))?;
239
240        // Verify signature against the pinned CA + anti-rollback floor.
241        if let Err(e) = bundle::verify(&signed, &state.ca_fingerprint, Some(state.last_bundle_seq))
242        {
243            // F5 fail-safe: a bundle whose CA fingerprint differs from our pin is
244            // rejected, and we KEEP the old pin. There is no supported live CA
245            // re-key path today, so a fingerprint change is treated as hostile; an
246            // intentional CA replacement is recovered by re-enrolling with a fresh
247            // invite (which carries the new fingerprint, F3).
248            if matches!(e, bundle::BundleError::PinMismatch) {
249                tracing::error!(
250                    host = %state.hostname,
251                    "Trust bundle CA fingerprint does NOT match the pinned CA — rejecting \
252                     (fail-safe). Re-enroll with a fresh invite if the CA was intentionally replaced."
253                );
254            }
255            return Err(CertmeshError::Internal(format!(
256                "trust bundle rejected: {e}"
257            )));
258        }
259
260        // F5 anchor self-heal: the bundle's `ca_cert_pem` provably hashes to our pin
261        // (verify enforced it), so writing it keeps the on-disk `ca.pem` — the trust
262        // root the mTLS renewal client loads — in sync and repairs drift/corruption.
263        // Done on every verified pull (even an unchanged seq) so a wiped anchor is
264        // restored promptly; the write is skipped when the file already matches.
265        {
266            let anchor = self
267                .state
268                .paths
269                .certs_dir()
270                .join(&state.hostname)
271                .join("ca.pem");
272            let want = signed.bundle.ca_cert_pem.clone();
273            // Best-effort: the closure logs its own write error, and any JoinError
274            // (task panic) is intentionally dropped. A write failure is harmless —
275            // the bundle was already pin-verified, and because this heal runs on
276            // every verified pull (before the seq short-circuit below) the next pull
277            // simply retries it.
278            let _ = tokio::task::spawn_blocking(move || {
279                let current = std::fs::read_to_string(&anchor).ok();
280                if current.as_deref() != Some(want.as_str()) {
281                    match write_file_atomic(&anchor, want.as_bytes(), false) {
282                        Ok(()) => tracing::info!(
283                            path = %anchor.display(),
284                            "Refreshed on-disk CA anchor from the verified trust bundle"
285                        ),
286                        Err(e) => tracing::warn!(error = %e, "Could not refresh on-disk CA anchor"),
287                    }
288                }
289            })
290            .await;
291        }
292
293        let seq = signed.bundle.seq;
294        if seq == state.last_bundle_seq {
295            return Ok(BundleOutcome::NoChange { seq });
296        }
297
298        let hostname = state.hostname.clone();
299        let self_revoked = signed.bundle.is_revoked(&hostname);
300        state.last_bundle_seq = seq;
301        state.policy = signed.bundle.policy.clone();
302        tokio::task::spawn_blocking(move || member::save(&member_path, &state))
303            .await
304            .map_err(|e| CertmeshError::Internal(format!("member state save task: {e}")))??;
305
306        if self_revoked {
307            tracing::error!(
308                %hostname,
309                "This node has been REVOKED in the mesh trust bundle (seq {seq}); renewal will be refused by the CA"
310            );
311        } else {
312            tracing::debug!(seq, "Trust bundle updated");
313        }
314        let _ = self
315            .state
316            .event_tx
317            .send(CertmeshEvent::BundleUpdated { self_revoked });
318        Ok(BundleOutcome::Updated { seq, self_revoked })
319    }
320
321    /// Validate a member's health heartbeat.
322    pub async fn health_check(
323        &self,
324        request: &protocol::HealthRequest,
325    ) -> Result<protocol::HealthResponse, CertmeshError> {
326        let ca_guard = self.state.ca.lock().await;
327        let ca = ca_guard.as_ref().ok_or_else(|| {
328            if self.state.paths.is_ca_initialized() {
329                CertmeshError::CaLocked
330            } else {
331                CertmeshError::CaNotInitialized
332            }
333        })?;
334
335        let current_fp = ca::ca_fingerprint(ca);
336        let valid =
337            health::validate_pinned_fingerprint(&current_fp, &request.pinned_ca_fingerprint);
338        drop(ca_guard); // release the CA lock before the roster commit (no lock held across disk I/O)
339
340        // Touch last_seen (no seq bump — liveness is not in the bundle); reject a
341        // revoked member at the boundary before recording the heartbeat.
342        self.state
343            .touch_roster(|roster| {
344                if roster.is_revoked(&request.hostname) {
345                    return Err(CertmeshError::Revoked(request.hostname.clone()));
346                }
347                roster.touch_member(&request.hostname);
348                Ok(())
349            })
350            .await?;
351
352        Ok(protocol::HealthResponse {
353            valid,
354            ca_fingerprint: current_fp,
355        })
356    }
357
358    /// Get the current node's roster role (if any).
359    ///
360    /// Returns `None` if the roster has no entry matching the local hostname.
361    pub async fn node_role(&self) -> Option<roster::MemberRole> {
362        let hostname = hostname::get()
363            .map(|h| h.to_string_lossy().to_string())
364            .ok()?;
365        let roster = self.state.roster.lock().await;
366        roster.find_member(&hostname).map(|m| m.role.clone())
367    }
368
369    /// Promote the local member to primary and demote any existing primary.
370    /// Returns true if the roster was updated.
371    pub async fn promote_self_to_primary(&self) -> Result<bool, CertmeshError> {
372        let hostname = hostname::get()
373            .map(|h| h.to_string_lossy().to_string())
374            .map_err(|_| CertmeshError::Internal("hostname unavailable".to_string()))?;
375
376        // Role changes are not bundle content → touch_roster (no seq bump), but the
377        // write still serializes behind the single writer (F8).
378        self.state
379            .touch_roster(|roster| {
380                let already_primary = roster
381                    .find_member(&hostname)
382                    .map(|m| m.role == roster::MemberRole::Primary)
383                    .ok_or_else(|| CertmeshError::NotFound(hostname.clone()))?;
384                if already_primary {
385                    return Ok(false);
386                }
387                for m in roster.members.iter_mut() {
388                    if m.role == roster::MemberRole::Primary {
389                        m.role = roster::MemberRole::Standby;
390                    }
391                }
392                if let Some(member) = roster.find_member_mut(&hostname) {
393                    member.role = roster::MemberRole::Primary;
394                } else {
395                    return Err(CertmeshError::NotFound(hostname.clone()));
396                }
397                Ok(true)
398            })
399            .await
400    }
401
402    /// Demote the local member to standby. Returns true if the roster changed.
403    pub async fn demote_self_to_standby(&self) -> Result<bool, CertmeshError> {
404        let hostname = hostname::get()
405            .map(|h| h.to_string_lossy().to_string())
406            .map_err(|_| CertmeshError::Internal("hostname unavailable".to_string()))?;
407
408        self.state
409            .touch_roster(|roster| {
410                let member = roster
411                    .find_member_mut(&hostname)
412                    .ok_or_else(|| CertmeshError::NotFound(hostname.clone()))?;
413                if member.role == roster::MemberRole::Standby {
414                    return Ok(false);
415                }
416                member.role = roster::MemberRole::Standby;
417                Ok(true)
418            })
419            .await
420    }
421
422    /// Add alias SANs to a member's roster entry (used by DNS alias feedback).
423    ///
424    /// Returns true if any SANs were added.
425    pub async fn add_alias_sans(
426        &self,
427        hostname: &str,
428        sans: &[String],
429    ) -> Result<bool, CertmeshError> {
430        self.state
431            .touch_roster(|roster| {
432                let member = roster
433                    .find_member_mut(hostname)
434                    .ok_or_else(|| CertmeshError::NotFound(hostname.to_string()))?;
435                let mut changed = false;
436                for san in sans {
437                    if !member.cert_sans.iter().any(|s| s == san) {
438                        member.cert_sans.push(san.clone());
439                        changed = true;
440                    }
441                }
442                Ok(changed)
443            })
444            .await
445    }
446
447    /// Get the local hostname.
448    pub fn local_hostname() -> Option<String> {
449        hostname::get()
450            .map(|h| h.to_string_lossy().to_string())
451            .ok()
452    }
453
454    /// Get the pinned CA fingerprint for the local node (if set).
455    pub async fn pinned_ca_fingerprint(&self) -> Option<String> {
456        let hostname = hostname::get()
457            .map(|h| h.to_string_lossy().to_string())
458            .ok()?;
459        let roster = self.state.roster.lock().await;
460        roster
461            .find_member(&hostname)
462            .and_then(|m| m.pinned_ca_fingerprint.clone())
463    }
464
465    /// Prepare promotion material for a standby.
466    ///
467    /// Called on the primary when a standby requests promotion.
468    /// Uses DH key agreement to encrypt the CA key for wire transfer.
469    pub async fn promote(
470        &self,
471        client_public_key: &[u8; 32],
472    ) -> Result<protocol::PromoteResponse, CertmeshError> {
473        let ca_guard = self.state.ca.lock().await;
474        let ca = ca_guard.as_ref().ok_or_else(|| {
475            if self.state.paths.is_ca_initialized() {
476                CertmeshError::CaLocked
477            } else {
478                CertmeshError::CaNotInitialized
479            }
480        })?;
481
482        let auth_guard = self.state.auth.lock().await;
483        let auth_state = auth_guard.as_ref().ok_or(CertmeshError::CaLocked)?;
484
485        let roster = self.state.roster.lock().await;
486        failover::prepare_promotion(ca, auth_state, &roster, client_public_key)
487    }
488}