Skip to main content

mockforge_platform_signing/
rotation.rs

1//! Dual-control rotation state machine + on-the-wire rotation event.
2//!
3//! See RFC §9 for the procedure this implements. The operator-facing
4//! runbook is in `docs/plugins/security/platform-signing-rotation-runbook.md`.
5//!
6//! # Phases
7//!
8//! ```text
9//!     Active        ── begin_handover ──▶    Transitioning
10//!     (cur)                                  (cur + next trusted)
11//!                                                   │
12//!                                                   │ retire_old
13//!                                                   ▼
14//!                                              Active(next)
15//! ```
16//!
17//! The state machine does not talk to AWS directly when retiring the
18//! old key (operators do that via the runbook + `aws kms disable-key`)
19//! — it just gates the on-the-wire event so plugin-hosts only see
20//! state changes that match the documented sequence.
21//!
22//! # Wire format
23//!
24//! [`RotationEvent`] is what the registry publishes; plugin-hosts pick
25//! it up (poll or push) and pass it to
26//! [`crate::verifier::verify_rotation_event`]. It contains:
27//!
28//!   - the **from** key id + DER public key (the current trust anchor)
29//!   - the **to**   key id + DER public key (the new trust anchor)
30//!   - a `transition_until` timestamp (both keys are trusted until this)
31//!   - a signature over the canonical JCS payload, produced by the
32//!     **from** key — this is the cryptographic handover that proves
33//!     the new key was authorized by the predecessor.
34//!
35//! Domain prefix: `mockforge-platform-rotation/v1\n` is prepended before
36//! signing, mirroring the prefix discipline in
37//! `mockforge-plugin-host::signing` (cross-protocol replay defense).
38
39use base64::Engine;
40use chrono::{DateTime, Duration, Utc};
41use serde::{Deserialize, Serialize};
42use thiserror::Error;
43use tokio::sync::Mutex;
44
45use crate::signer::{PlatformSigner, SignerError, SigningAlgorithm};
46
/// Domain-separation prefix for the rotation-event signed bytes.
///
/// [`RotationEvent::signed_bytes`] prepends these bytes to the canonical
/// JSON payload, so the handover signature covers `prefix || payload`
/// rather than the bare payload.
///
/// Same discipline as `mockforge-plugin-host::signing` — the intent is
/// that a signature produced over any other JSON document (whose signed
/// bytes do not start with this exact prefix) cannot be replayed as a
/// platform rotation event.
pub const ROTATION_DOMAIN_PREFIX: &[u8] = b"mockforge-platform-rotation/v1\n";
53
/// Default transition window, in days. Matches RFC §9 ("≥ 30 days").
///
/// [`RotationStateMachine::begin_handover`] does not apply this default
/// on its own — it takes an explicit window — so callers that want the
/// default pass `Duration::days(DEFAULT_TRANSITION_DAYS)`.
pub const DEFAULT_TRANSITION_DAYS: i64 = 30;
56
/// Where the rotation state machine currently sits.
///
/// Serialized in kebab-case (`active` / `transitioning`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum RotationPhase {
    /// Single active key. Steady state, and the phase a freshly built
    /// [`RotationStateMachine`] starts in.
    Active,
    /// Both old and new keys are trusted. Hosts accept signatures from
    /// either. Entered by `begin_handover`. The state machine does not
    /// leave this phase by itself when `transition_until` passes; it
    /// stays here until `retire_old` succeeds, which in turn requires
    /// `transition_until` to be in the past.
    Transitioning,
}
67
/// Inner payload of a [`RotationEvent`] — the bytes that get signed.
///
/// Serialized via [`serde_jcs`] (RFC 8785 canonical JSON) so the byte
/// representation is stable across hosts. Any drift in field order or
/// number encoding silently invalidates the signature, so canonical
/// JSON is non-negotiable.
///
/// Field names are camelCase on the wire, and deserialization rejects
/// unknown fields.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
pub struct RotationEventPayload {
    /// Schema version. Always `1` for this crate.
    pub version: u32,
    /// Algorithm of the **from** key — what signed this payload.
    pub from_algorithm: SigningAlgorithm,
    /// Opaque id of the previous key (e.g. KMS ARN).
    pub from_key_id: String,
    /// `SubjectPublicKeyInfo` (DER) of the previous key, base64-encoded
    /// (`begin_handover` uses the standard base64 engine).
    pub from_public_key_b64: String,
    /// Algorithm of the **to** key.
    pub to_algorithm: SigningAlgorithm,
    /// Opaque id of the new key.
    pub to_key_id: String,
    /// `SubjectPublicKeyInfo` (DER) of the new key, base64-encoded
    /// (`begin_handover` uses the standard base64 engine).
    pub to_public_key_b64: String,
    /// UTC instant at which the transition window opened. Set to the
    /// wall-clock time at which `begin_handover` built the payload.
    pub issued_at: DateTime<Utc>,
    /// UTC instant after which the previous key should no longer be
    /// trusted. Plugin-hosts MUST evict the `from` key from their trust
    /// cache once their wall clock passes this. Computed as
    /// `issued_at + transition_window` by `begin_handover`.
    pub transition_until: DateTime<Utc>,
}
98
/// On-the-wire rotation event — published by the registry, consumed by
/// every plugin-host.
///
/// Field names are camelCase on the wire, and deserialization rejects
/// unknown fields.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
pub struct RotationEvent {
    /// The signed payload.
    pub payload: RotationEventPayload,
    /// DER-encoded ECDSA signature over
    /// `ROTATION_DOMAIN_PREFIX || serde_jcs(payload)`, base64-encoded.
    /// Signed by `payload.from_key_id`, i.e. by the key being rotated
    /// away from. The exact bytes covered are what
    /// [`RotationEvent::signed_bytes`] returns for `payload`.
    pub handover_signature_b64: String,
}
111
112impl RotationEvent {
113    /// Canonical bytes that were signed to produce
114    /// `handover_signature_b64`. Used by the verifier; exposed so tests
115    /// and the audit log can show the operator exactly what was signed.
116    pub fn signed_bytes(payload: &RotationEventPayload) -> Result<Vec<u8>, RotationError> {
117        let canonical = serde_jcs::to_vec(payload)
118            .map_err(|e| RotationError::Encoding(format!("serde_jcs failed: {e}")))?;
119        let mut out = Vec::with_capacity(ROTATION_DOMAIN_PREFIX.len() + canonical.len());
120        out.extend_from_slice(ROTATION_DOMAIN_PREFIX);
121        out.extend_from_slice(&canonical);
122        Ok(out)
123    }
124}
125
/// Drives the rotation procedure end-to-end.
///
/// One state machine corresponds to one platform deployment.
///
/// The mutable part of the state (phase + last published event) lives
/// behind an internal [`tokio::sync::Mutex`], and every method takes
/// `&self`, so callers driving the machine concurrently are already
/// serialized by that lock. Sharing one instance through a plain `Arc`
/// is enough; an outer `Arc<Mutex<_>>` is not needed for correctness.
///
/// The type does not opt out of `Send`/`Sync`: those auto traits are
/// derived from its fields, chiefly the signer type `S`.
pub struct RotationStateMachine<S: PlatformSigner> {
    /// Signer for the currently active key. It produces the handover
    /// signature and is never replaced by this type.
    current: S,
    /// Phase and most recent rotation event, guarded by an async mutex.
    inner: Mutex<RotationInner>,
}
136
/// Mutable state of a [`RotationStateMachine`], kept together so a single
/// lock acquisition observes and updates both fields consistently.
#[derive(Debug)]
struct RotationInner {
    /// Phase the machine is currently in.
    phase: RotationPhase,
    /// Event produced by the most recent successful `begin_handover`,
    /// or `None` if no handover has been signed yet. `retire_old` reads
    /// it but does not clear it.
    last_event: Option<RotationEvent>,
}
142
143impl<S: PlatformSigner> RotationStateMachine<S> {
144    /// Build a fresh state machine seeded with the active key. Phase is
145    /// [`RotationPhase::Active`].
146    pub fn new(current: S) -> Self {
147        Self {
148            current,
149            inner: Mutex::new(RotationInner {
150                phase: RotationPhase::Active,
151                last_event: None,
152            }),
153        }
154    }
155
156    /// Current phase.
157    pub async fn phase(&self) -> RotationPhase {
158        self.inner.lock().await.phase
159    }
160
161    /// Most recent rotation event published, if any.
162    pub async fn last_event(&self) -> Option<RotationEvent> {
163        self.inner.lock().await.last_event.clone()
164    }
165
166    /// Step 1 of the runbook (after the operator has generated the new
167    /// KMS key out-of-band). Fetches both public keys, asks the current
168    /// signer to sign the handover, returns the wire event.
169    ///
170    /// Transitions the state machine from [`RotationPhase::Active`] to
171    /// [`RotationPhase::Transitioning`]. Refuses to re-fire if a
172    /// rotation is already in progress — emergency revocation is a
173    /// distinct call path (see [`Self::emergency_revoke_current`]).
174    ///
175    /// `transition_window`: how long both keys remain trusted. Default
176    /// per RFC is 30 days (see [`DEFAULT_TRANSITION_DAYS`]).
177    pub async fn begin_handover<N: PlatformSigner>(
178        &self,
179        next: &N,
180        transition_window: Duration,
181    ) -> Result<RotationEvent, RotationError> {
182        let mut inner = self.inner.lock().await;
183        if inner.phase != RotationPhase::Active {
184            return Err(RotationError::WrongPhase {
185                current: inner.phase,
186                expected: RotationPhase::Active,
187            });
188        }
189        if self.current.key_id() == next.key_id() {
190            return Err(RotationError::SameKey);
191        }
192        if transition_window <= Duration::zero() {
193            return Err(RotationError::InvalidTransitionWindow);
194        }
195
196        let now = Utc::now();
197        let payload = RotationEventPayload {
198            version: 1,
199            from_algorithm: self.current.algorithm(),
200            from_key_id: self.current.key_id().to_string(),
201            from_public_key_b64: b64_encode(&self.current.public_key_der().await?),
202            to_algorithm: next.algorithm(),
203            to_key_id: next.key_id().to_string(),
204            to_public_key_b64: b64_encode(&next.public_key_der().await?),
205            issued_at: now,
206            transition_until: now + transition_window,
207        };
208        let to_sign = RotationEvent::signed_bytes(&payload)?;
209        let sig_der = self.current.sign(&to_sign).await?;
210        let event = RotationEvent {
211            payload,
212            handover_signature_b64: b64_encode(&sig_der),
213        };
214        inner.phase = RotationPhase::Transitioning;
215        inner.last_event = Some(event.clone());
216        tracing::info!(
217            from_key_id = %self.current.key_id(),
218            to_key_id = %next.key_id(),
219            transition_window_days = transition_window.num_days(),
220            "platform signing-root rotation: handover signed"
221        );
222        Ok(event)
223    }
224
225    /// Step 2 of the runbook — operator calls this after the transition
226    /// window has elapsed and the runbook's manual `aws kms disable-key`
227    /// step is complete. Brings the state machine back to
228    /// [`RotationPhase::Active`].
229    ///
230    /// Note: the **state machine** does not switch its `current` signer
231    /// (this type is generic and immutable). The expectation is that
232    /// the registry process restarts with the new `MOCKFORGE_PLATFORM_SIGNING_KMS_KEY_ID`
233    /// pointing at the new ARN. This method exists for in-memory state
234    /// hygiene + audit completeness, and is the call site where the
235    /// `PlatformSigningKeyRetired` audit event fires.
236    pub async fn retire_old(&self) -> Result<(), RotationError> {
237        let mut inner = self.inner.lock().await;
238        if inner.phase != RotationPhase::Transitioning {
239            return Err(RotationError::WrongPhase {
240                current: inner.phase,
241                expected: RotationPhase::Transitioning,
242            });
243        }
244        // Clone the relevant fields out of the immutable borrow so we
245        // can subsequently mutate `inner.phase` without overlap.
246        let (from_id, to_id, transition_until) = {
247            let last = inner.last_event.as_ref().ok_or(RotationError::NoRotationInProgress)?;
248            (
249                last.payload.from_key_id.clone(),
250                last.payload.to_key_id.clone(),
251                last.payload.transition_until,
252            )
253        };
254        if Utc::now() < transition_until {
255            return Err(RotationError::TransitionStillOpen {
256                until: transition_until,
257            });
258        }
259        inner.phase = RotationPhase::Active;
260        tracing::info!(
261            from_key_id = %from_id,
262            to_key_id = %to_id,
263            "platform signing-root rotation: old key retired"
264        );
265        Ok(())
266    }
267
268    /// Emergency: revoke the current key without a successor. Used when
269    /// the active key is believed compromised and no new key has been
270    /// provisioned yet. After this returns, the registry refuses to
271    /// publish anything signed by the old key.
272    ///
273    /// This does NOT publish a rotation event — there's no new key to
274    /// hand over to. The runbook's "Emergency revocation" section
275    /// covers the operator-facing process (notify all hosted-mock
276    /// owners, then run [`Self::begin_handover`] with a fresh key once
277    /// it's available).
278    pub async fn emergency_revoke_current(&self) -> Result<(), RotationError> {
279        // We still take the lock so the state machine refuses
280        // concurrent handovers while the operator is responding to the
281        // incident. There's no phase transition — emergency revoke is
282        // a "shut everything down" signal handled by the caller.
283        let _inner = self.inner.lock().await;
284        tracing::error!(
285            key_id = %self.current.key_id(),
286            "platform signing-root: emergency revoke fired — registry refusing further signs"
287        );
288        Ok(())
289    }
290}
291
292fn b64_encode(bytes: &[u8]) -> String {
293    base64::engine::general_purpose::STANDARD.encode(bytes)
294}
295
/// Errors the state machine can produce.
#[derive(Debug, Error)]
pub enum RotationError {
    /// Tried to do something in the wrong phase (e.g. retire while still
    /// active, or begin a second handover while one is in progress).
    /// Hints at an operator misstep in the runbook.
    #[error("rotation in phase {current:?}, but operation requires {expected:?}")]
    WrongPhase {
        /// What phase the state machine is in.
        current: RotationPhase,
        /// What phase the operation expected.
        expected: RotationPhase,
    },

    /// `begin_handover` was called with the same key id as the current
    /// signer. A no-op rotation would still publish an event that
    /// every host would refuse.
    #[error("from-key and to-key have the same key id; nothing to rotate")]
    SameKey,

    /// `begin_handover` rejected the requested transition window; a
    /// zero or negative window is refused. Hosts must have a real
    /// overlap window or rotation is just an atomic swap from their
    /// perspective.
    #[error("transition window must be a positive duration")]
    InvalidTransitionWindow,

    /// `retire_old` was called but the transition window hasn't elapsed
    /// yet. Tells the operator to wait or override.
    ///
    /// NOTE(review): no override path is visible in this module —
    /// confirm where it lives.
    #[error("transition window is still open until {until}")]
    TransitionStillOpen {
        /// When the window closes.
        until: DateTime<Utc>,
    },

    /// `retire_old` found the machine in `Transitioning` but with no
    /// recorded rotation event. `begin_handover` sets both together, so
    /// this is a defensive guard; calling `retire_old` before any
    /// handover was started yields [`RotationError::WrongPhase`] instead.
    #[error("no rotation in progress")]
    NoRotationInProgress,

    /// JCS encoding failed. Should be impossible for the fixed-shape
    /// payload, but propagated rather than panicked.
    #[error("rotation encoding error: {0}")]
    Encoding(String),

    /// The underlying signer failed (fetching a public key or signing).
    /// Displayed transparently, i.e. with the signer's own message.
    #[error(transparent)]
    Signer(#[from] SignerError),
}
343
#[cfg(test)]
mod tests {
    use super::*;
    use crate::signer::MockSigner;
    use crate::verifier::verify_rotation_event;

    /// A full handover flips the phase and yields an event the verifier accepts.
    #[tokio::test]
    async fn happy_path_handover_emits_verifiable_event() {
        let old_signer = MockSigner::generate("key-old").unwrap();
        let new_signer = MockSigner::generate("key-new").unwrap();
        let machine = RotationStateMachine::new(old_signer);
        assert_eq!(machine.phase().await, RotationPhase::Active);

        let event = machine
            .begin_handover(&new_signer, Duration::days(30))
            .await
            .expect("handover succeeds");

        assert_eq!(machine.phase().await, RotationPhase::Transitioning);
        let payload = &event.payload;
        assert_eq!(payload.from_key_id, "key-old");
        assert_eq!(payload.to_key_id, "key-new");
        assert_eq!(payload.version, 1);
        assert_eq!(payload.transition_until - payload.issued_at, Duration::days(30));

        // Feed the event to the verifier: this proves the bytes that
        // were signed are the same bytes the verifier rebuilds.
        verify_rotation_event(&event).expect("rotation event verifies");
    }

    /// A second handover is rejected while the first is still open.
    #[tokio::test]
    async fn cannot_begin_handover_while_transitioning() {
        let active = MockSigner::generate("k1").unwrap();
        let first_successor = MockSigner::generate("k2").unwrap();
        let second_successor = MockSigner::generate("k3").unwrap();
        let machine = RotationStateMachine::new(active);

        machine.begin_handover(&first_successor, Duration::days(30)).await.unwrap();

        let second_attempt = machine.begin_handover(&second_successor, Duration::days(30)).await;
        assert!(matches!(second_attempt.unwrap_err(), RotationError::WrongPhase { .. }));
    }

    /// Handover to a signer that reports the current key id is rejected.
    #[tokio::test]
    async fn refuses_same_key_handover() {
        // Operator misconfiguration: two independently generated signers
        // that report the same key id. The keypairs themselves differ —
        // the state machine compares key ids only, so that is enough to
        // trigger the check.
        let active = MockSigner::generate("same-id").unwrap();
        let successor = MockSigner::generate("same-id").unwrap();
        let machine = RotationStateMachine::new(active);

        let outcome = machine.begin_handover(&successor, Duration::days(30)).await;
        assert!(matches!(outcome.unwrap_err(), RotationError::SameKey));
    }

    /// A zero-length transition window is rejected.
    #[tokio::test]
    async fn refuses_non_positive_transition_window() {
        let active = MockSigner::generate("k1").unwrap();
        let successor = MockSigner::generate("k2").unwrap();
        let machine = RotationStateMachine::new(active);

        let outcome = machine.begin_handover(&successor, Duration::zero()).await;
        assert!(matches!(outcome.unwrap_err(), RotationError::InvalidTransitionWindow));
    }

    /// Retirement is refused while the transition window is still open.
    #[tokio::test]
    async fn retire_old_refuses_while_window_open() {
        let active = MockSigner::generate("k1").unwrap();
        let successor = MockSigner::generate("k2").unwrap();
        let machine = RotationStateMachine::new(active);
        machine.begin_handover(&successor, Duration::days(30)).await.unwrap();

        let outcome = machine.retire_old().await;
        assert!(matches!(outcome.unwrap_err(), RotationError::TransitionStillOpen { .. }));
    }

    /// Retirement succeeds once the window has closed and returns to `Active`.
    #[tokio::test]
    async fn retire_old_succeeds_after_window() {
        let active = MockSigner::generate("k1").unwrap();
        let successor = MockSigner::generate("k2").unwrap();
        let machine = RotationStateMachine::new(active);

        // A non-positive window is rejected by `begin_handover`, so open
        // a 1 ms window and sleep past it instead.
        machine.begin_handover(&successor, Duration::milliseconds(1)).await.unwrap();
        tokio::time::sleep(std::time::Duration::from_millis(5)).await;

        machine.retire_old().await.expect("retire_old after window closes");
        assert_eq!(machine.phase().await, RotationPhase::Active);
    }

    /// Canonical signed bytes are stable across calls and carry the domain prefix.
    #[tokio::test]
    async fn rotation_event_jcs_is_deterministic() {
        // The verifier rebuilds the signed bytes from the payload, so
        // serializing the same payload twice must give identical bytes;
        // otherwise signatures would silently stop verifying.
        let payload = RotationEventPayload {
            version: 1,
            from_algorithm: SigningAlgorithm::EcdsaSha256P256,
            from_key_id: "old".into(),
            from_public_key_b64: "AAAA".into(),
            to_algorithm: SigningAlgorithm::EcdsaSha256P256,
            to_key_id: "new".into(),
            to_public_key_b64: "BBBB".into(),
            issued_at: Utc::now(),
            transition_until: Utc::now() + Duration::days(30),
        };
        let first = RotationEvent::signed_bytes(&payload).unwrap();
        let second = RotationEvent::signed_bytes(&payload).unwrap();
        assert_eq!(first, second);
        // The domain prefix must lead the signed bytes.
        assert!(first.starts_with(ROTATION_DOMAIN_PREFIX));
    }
}