mockforge_platform_signing/rotation.rs
1//! Dual-control rotation state machine + on-the-wire rotation event.
2//!
3//! See RFC §9 for the procedure this implements. The operator-facing
4//! runbook is in `docs/plugins/security/platform-signing-rotation-runbook.md`.
5//!
6//! # Phases
7//!
8//! ```text
//!   Active ── begin_handover ──▶ Transitioning
//!   (cur)                        (cur + next trusted)
//!                                      │
//!                                      │ retire_old
//!                                      ▼
//!                                 Active(next)
15//! ```
16//!
17//! The state machine does not talk to AWS directly when retiring the
18//! old key (operators do that via the runbook + `aws kms disable-key`)
19//! — it just gates the on-the-wire event so plugin-hosts only see
20//! state changes that match the documented sequence.
21//!
22//! # Wire format
23//!
24//! [`RotationEvent`] is what the registry publishes; plugin-hosts pick
25//! it up (poll or push) and pass it to
26//! [`crate::verifier::verify_rotation_event`]. It contains:
27//!
28//! - the **from** key id + DER public key (the current trust anchor)
29//! - the **to** key id + DER public key (the new trust anchor)
30//! - a `transition_until` timestamp (both keys are trusted until this)
31//! - a signature over the canonical JCS payload, produced by the
32//! **from** key — this is the cryptographic handover that proves
33//! the new key was authorized by the predecessor.
34//!
35//! Domain prefix: `mockforge-platform-rotation/v1\n` is prepended before
36//! signing, mirroring the prefix discipline in
37//! `mockforge-plugin-host::signing` (cross-protocol replay defense).
38
39use base64::Engine;
40use chrono::{DateTime, Duration, Utc};
41use serde::{Deserialize, Serialize};
42use thiserror::Error;
43use tokio::sync::Mutex;
44
45use crate::signer::{PlatformSigner, SignerError, SigningAlgorithm};
46
/// Domain-separation prefix for the rotation-event signed bytes.
///
/// [`RotationEvent::signed_bytes`] places these bytes in front of the
/// canonical JSON of the payload before it is signed. Same discipline as
/// `mockforge-plugin-host::signing`: a signature produced over any other
/// document (which does not start with this exact prefix) cannot be
/// replayed as a platform rotation event.
pub const ROTATION_DOMAIN_PREFIX: &[u8] = b"mockforge-platform-rotation/v1\n";
53
/// Default transition window, expressed in days. Matches RFC §9 ("≥ 30 days").
///
/// Nothing in this module applies the default automatically:
/// `RotationStateMachine::begin_handover` takes the window as an explicit
/// `Duration`, so a caller wanting the default passes
/// `Duration::days(DEFAULT_TRANSITION_DAYS)`.
pub const DEFAULT_TRANSITION_DAYS: i64 = 30;
56
/// Where the rotation state machine currently sits.
///
/// Serialized in kebab-case (`"active"`, `"transitioning"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum RotationPhase {
    /// Single active key. Steady state, and the only phase in which
    /// `begin_handover` is accepted.
    Active,
    /// Both old and new keys are trusted. Hosts accept signatures from
    /// either. The state machine stays in this phase until `retire_old`
    /// succeeds, which it refuses to do before `transition_until` has
    /// passed.
    Transitioning,
}
67
/// Inner payload of a [`RotationEvent`] — the bytes that get signed.
///
/// Serialized via [`serde_jcs`] (RFC 8785 canonical JSON) so the byte
/// representation is stable across hosts. Any drift in field order or
/// number encoding silently invalidates the signature, so canonical
/// JSON is non-negotiable.
///
/// Field names are camelCase on the wire (e.g. `fromKeyId`), and
/// deserialization rejects unknown fields (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
pub struct RotationEventPayload {
    /// Schema version. Always `1` for this crate.
    pub version: u32,
    /// Algorithm of the **from** key — what signed this payload.
    pub from_algorithm: SigningAlgorithm,
    /// Opaque id of the previous key (e.g. KMS ARN).
    pub from_key_id: String,
    /// `SubjectPublicKeyInfo` (DER) of the previous key, base64-encoded.
    pub from_public_key_b64: String,
    /// Algorithm of the **to** key.
    pub to_algorithm: SigningAlgorithm,
    /// Opaque id of the new key.
    pub to_key_id: String,
    /// `SubjectPublicKeyInfo` (DER) of the new key, base64-encoded.
    pub to_public_key_b64: String,
    /// UTC instant at which the transition window opened (the moment
    /// `begin_handover` built this payload).
    pub issued_at: DateTime<Utc>,
    /// UTC instant after which the previous key should no longer be
    /// trusted. Plugin-hosts MUST evict the `from` key from their trust
    /// cache once their wall clock passes this.
    pub transition_until: DateTime<Utc>,
}
98
/// On-the-wire rotation event — published by the registry, consumed by
/// every plugin-host.
///
/// Pairs the signed payload with the handover signature made by the
/// outgoing (**from**) key. Field names are camelCase on the wire and
/// unknown fields are rejected at deserialization.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
pub struct RotationEvent {
    /// The signed payload.
    pub payload: RotationEventPayload,
    /// DER-encoded ECDSA signature over
    /// `ROTATION_DOMAIN_PREFIX || serde_jcs(payload)`, base64-encoded.
    /// Signed by `payload.from_key_id`.
    pub handover_signature_b64: String,
}
111
112impl RotationEvent {
113 /// Canonical bytes that were signed to produce
114 /// `handover_signature_b64`. Used by the verifier; exposed so tests
115 /// and the audit log can show the operator exactly what was signed.
116 pub fn signed_bytes(payload: &RotationEventPayload) -> Result<Vec<u8>, RotationError> {
117 let canonical = serde_jcs::to_vec(payload)
118 .map_err(|e| RotationError::Encoding(format!("serde_jcs failed: {e}")))?;
119 let mut out = Vec::with_capacity(ROTATION_DOMAIN_PREFIX.len() + canonical.len());
120 out.extend_from_slice(ROTATION_DOMAIN_PREFIX);
121 out.extend_from_slice(&canonical);
122 Ok(out)
123 }
124}
125
/// Drives the rotation procedure end-to-end.
///
/// One state machine corresponds to one platform deployment. Every
/// method takes `&self`: the mutable state (phase + last event) lives
/// behind the internal `tokio::sync::Mutex`, so concurrent operators are
/// already serialized by that lock and no outer mutex is required just
/// to call the methods.
///
/// NOTE(review): an earlier revision of this comment said the type is
/// `!Sync`. Nothing in this definition opts out of `Sync`; whether the
/// type is `Send`/`Sync` follows from the signer type `S` — confirm
/// against the `PlatformSigner` bounds.
pub struct RotationStateMachine<S: PlatformSigner> {
    // The signer for the currently active key. Never replaced after
    // construction.
    current: S,
    // Phase and most recent event, guarded by an async mutex so the lock
    // can be held across signer awaits.
    inner: Mutex<RotationInner>,
}
136
/// Mutable state of a [`RotationStateMachine`], kept behind its mutex.
#[derive(Debug)]
struct RotationInner {
    /// Phase the machine is currently in.
    phase: RotationPhase,
    /// Event produced by the most recent successful `begin_handover`;
    /// `None` until the first handover. `retire_old` reads it but does
    /// not clear it.
    last_event: Option<RotationEvent>,
}
142
143impl<S: PlatformSigner> RotationStateMachine<S> {
144 /// Build a fresh state machine seeded with the active key. Phase is
145 /// [`RotationPhase::Active`].
146 pub fn new(current: S) -> Self {
147 Self {
148 current,
149 inner: Mutex::new(RotationInner {
150 phase: RotationPhase::Active,
151 last_event: None,
152 }),
153 }
154 }
155
156 /// Current phase.
157 pub async fn phase(&self) -> RotationPhase {
158 self.inner.lock().await.phase
159 }
160
161 /// Most recent rotation event published, if any.
162 pub async fn last_event(&self) -> Option<RotationEvent> {
163 self.inner.lock().await.last_event.clone()
164 }
165
166 /// Step 1 of the runbook (after the operator has generated the new
167 /// KMS key out-of-band). Fetches both public keys, asks the current
168 /// signer to sign the handover, returns the wire event.
169 ///
170 /// Transitions the state machine from [`RotationPhase::Active`] to
171 /// [`RotationPhase::Transitioning`]. Refuses to re-fire if a
172 /// rotation is already in progress — emergency revocation is a
173 /// distinct call path (see [`Self::emergency_revoke_current`]).
174 ///
175 /// `transition_window`: how long both keys remain trusted. Default
176 /// per RFC is 30 days (see [`DEFAULT_TRANSITION_DAYS`]).
177 pub async fn begin_handover<N: PlatformSigner>(
178 &self,
179 next: &N,
180 transition_window: Duration,
181 ) -> Result<RotationEvent, RotationError> {
182 let mut inner = self.inner.lock().await;
183 if inner.phase != RotationPhase::Active {
184 return Err(RotationError::WrongPhase {
185 current: inner.phase,
186 expected: RotationPhase::Active,
187 });
188 }
189 if self.current.key_id() == next.key_id() {
190 return Err(RotationError::SameKey);
191 }
192 if transition_window <= Duration::zero() {
193 return Err(RotationError::InvalidTransitionWindow);
194 }
195
196 let now = Utc::now();
197 let payload = RotationEventPayload {
198 version: 1,
199 from_algorithm: self.current.algorithm(),
200 from_key_id: self.current.key_id().to_string(),
201 from_public_key_b64: b64_encode(&self.current.public_key_der().await?),
202 to_algorithm: next.algorithm(),
203 to_key_id: next.key_id().to_string(),
204 to_public_key_b64: b64_encode(&next.public_key_der().await?),
205 issued_at: now,
206 transition_until: now + transition_window,
207 };
208 let to_sign = RotationEvent::signed_bytes(&payload)?;
209 let sig_der = self.current.sign(&to_sign).await?;
210 let event = RotationEvent {
211 payload,
212 handover_signature_b64: b64_encode(&sig_der),
213 };
214 inner.phase = RotationPhase::Transitioning;
215 inner.last_event = Some(event.clone());
216 tracing::info!(
217 from_key_id = %self.current.key_id(),
218 to_key_id = %next.key_id(),
219 transition_window_days = transition_window.num_days(),
220 "platform signing-root rotation: handover signed"
221 );
222 Ok(event)
223 }
224
225 /// Step 2 of the runbook — operator calls this after the transition
226 /// window has elapsed and the runbook's manual `aws kms disable-key`
227 /// step is complete. Brings the state machine back to
228 /// [`RotationPhase::Active`].
229 ///
230 /// Note: the **state machine** does not switch its `current` signer
231 /// (this type is generic and immutable). The expectation is that
232 /// the registry process restarts with the new `MOCKFORGE_PLATFORM_SIGNING_KMS_KEY_ID`
233 /// pointing at the new ARN. This method exists for in-memory state
234 /// hygiene + audit completeness, and is the call site where the
235 /// `PlatformSigningKeyRetired` audit event fires.
236 pub async fn retire_old(&self) -> Result<(), RotationError> {
237 let mut inner = self.inner.lock().await;
238 if inner.phase != RotationPhase::Transitioning {
239 return Err(RotationError::WrongPhase {
240 current: inner.phase,
241 expected: RotationPhase::Transitioning,
242 });
243 }
244 // Clone the relevant fields out of the immutable borrow so we
245 // can subsequently mutate `inner.phase` without overlap.
246 let (from_id, to_id, transition_until) = {
247 let last = inner.last_event.as_ref().ok_or(RotationError::NoRotationInProgress)?;
248 (
249 last.payload.from_key_id.clone(),
250 last.payload.to_key_id.clone(),
251 last.payload.transition_until,
252 )
253 };
254 if Utc::now() < transition_until {
255 return Err(RotationError::TransitionStillOpen {
256 until: transition_until,
257 });
258 }
259 inner.phase = RotationPhase::Active;
260 tracing::info!(
261 from_key_id = %from_id,
262 to_key_id = %to_id,
263 "platform signing-root rotation: old key retired"
264 );
265 Ok(())
266 }
267
268 /// Emergency: revoke the current key without a successor. Used when
269 /// the active key is believed compromised and no new key has been
270 /// provisioned yet. After this returns, the registry refuses to
271 /// publish anything signed by the old key.
272 ///
273 /// This does NOT publish a rotation event — there's no new key to
274 /// hand over to. The runbook's "Emergency revocation" section
275 /// covers the operator-facing process (notify all hosted-mock
276 /// owners, then run [`Self::begin_handover`] with a fresh key once
277 /// it's available).
278 pub async fn emergency_revoke_current(&self) -> Result<(), RotationError> {
279 // We still take the lock so the state machine refuses
280 // concurrent handovers while the operator is responding to the
281 // incident. There's no phase transition — emergency revoke is
282 // a "shut everything down" signal handled by the caller.
283 let _inner = self.inner.lock().await;
284 tracing::error!(
285 key_id = %self.current.key_id(),
286 "platform signing-root: emergency revoke fired — registry refusing further signs"
287 );
288 Ok(())
289 }
290}
291
292fn b64_encode(bytes: &[u8]) -> String {
293 base64::engine::general_purpose::STANDARD.encode(bytes)
294}
295
/// Errors the state machine can produce.
#[derive(Debug, Error)]
pub enum RotationError {
    /// Tried to do something in the wrong phase (e.g. retire while still
    /// active). Hints at an operator misstep in the runbook.
    #[error("rotation in phase {current:?}, but operation requires {expected:?}")]
    WrongPhase {
        /// What phase the state machine is in.
        current: RotationPhase,
        /// What phase the operation expected.
        expected: RotationPhase,
    },

    /// `begin_handover` was called with the same key id as the current
    /// signer. A no-op rotation would still publish an event that
    /// every host would refuse.
    #[error("from-key and to-key have the same key id; nothing to rotate")]
    SameKey,

    /// `begin_handover` was called with a non-positive transition
    /// window. Hosts must have a real overlap window or rotation is
    /// just an atomic swap from their perspective.
    #[error("transition window must be a positive duration")]
    InvalidTransitionWindow,

    /// `retire_old` was called but the transition window hasn't elapsed
    /// yet. Tells the operator to wait.
    ///
    /// NOTE(review): an "override" path is mentioned in the runbook
    /// context but is not part of this module — confirm where it lives.
    #[error("transition window is still open until {until}")]
    TransitionStillOpen {
        /// When the window closes.
        until: DateTime<Utc>,
    },

    /// `retire_old` found the machine in `Transitioning` but with no
    /// recorded rotation event. Calling `retire_old` before any handover
    /// yields [`RotationError::WrongPhase`] instead, because the phase
    /// check runs first; this variant guards the internal invariant that
    /// a transitioning machine always has an event.
    #[error("no rotation in progress")]
    NoRotationInProgress,

    /// JCS encoding failed. Should be impossible for the fixed-shape
    /// payload, but propagated rather than panicked.
    #[error("rotation encoding error: {0}")]
    Encoding(String),

    /// The underlying signer failed (public-key fetch or signing).
    /// Displayed transparently as the signer's own error.
    #[error(transparent)]
    Signer(#[from] SignerError),
}
343
#[cfg(test)]
mod tests {
    use super::*;
    use crate::signer::MockSigner;
    use crate::verifier::verify_rotation_event;

    /// A handover from `Active` flips the phase, fills in the payload as
    /// expected, and produces an event the verifier accepts.
    #[tokio::test]
    async fn happy_path_handover_emits_verifiable_event() {
        let old_key = MockSigner::generate("key-old").unwrap();
        let new_key = MockSigner::generate("key-new").unwrap();
        let machine = RotationStateMachine::new(old_key);
        assert_eq!(machine.phase().await, RotationPhase::Active);

        let window = Duration::days(30);
        let event = machine.begin_handover(&new_key, window).await.expect("handover succeeds");

        assert_eq!(machine.phase().await, RotationPhase::Transitioning);
        let signed = &event.payload;
        assert_eq!(signed.from_key_id, "key-old");
        assert_eq!(signed.to_key_id, "key-new");
        assert_eq!(signed.version, 1);
        assert_eq!(signed.transition_until - signed.issued_at, window);

        // Round-trip through the verifier: the bytes the signer signed
        // must be exactly the bytes the verifier reconstructs.
        verify_rotation_event(&event).expect("rotation event verifies");
    }

    /// A second handover while the first is still open is rejected.
    #[tokio::test]
    async fn cannot_begin_handover_while_transitioning() {
        let machine = RotationStateMachine::new(MockSigner::generate("k1").unwrap());
        let first_successor = MockSigner::generate("k2").unwrap();
        let second_successor = MockSigner::generate("k3").unwrap();

        machine.begin_handover(&first_successor, Duration::days(30)).await.unwrap();
        let second_attempt = machine.begin_handover(&second_successor, Duration::days(30)).await;
        assert!(matches!(second_attempt, Err(RotationError::WrongPhase { .. })));
    }

    /// Operator misconfiguration: the successor carries the same key id
    /// as the current signer. The two mock signers hold different
    /// keypairs, but the check is on the id alone.
    #[tokio::test]
    async fn refuses_same_key_handover() {
        let machine = RotationStateMachine::new(MockSigner::generate("same-id").unwrap());
        let successor = MockSigner::generate("same-id").unwrap();

        let outcome = machine.begin_handover(&successor, Duration::days(30)).await;
        assert!(matches!(outcome, Err(RotationError::SameKey)));
    }

    /// A zero-length window is rejected.
    #[tokio::test]
    async fn refuses_non_positive_transition_window() {
        let machine = RotationStateMachine::new(MockSigner::generate("k1").unwrap());
        let successor = MockSigner::generate("k2").unwrap();

        let outcome = machine.begin_handover(&successor, Duration::zero()).await;
        assert!(matches!(outcome, Err(RotationError::InvalidTransitionWindow)));
    }

    /// Retiring immediately after a 30-day handover is refused.
    #[tokio::test]
    async fn retire_old_refuses_while_window_open() {
        let machine = RotationStateMachine::new(MockSigner::generate("k1").unwrap());
        let successor = MockSigner::generate("k2").unwrap();
        machine.begin_handover(&successor, Duration::days(30)).await.unwrap();

        let outcome = machine.retire_old().await;
        assert!(matches!(outcome, Err(RotationError::TransitionStillOpen { .. })));
    }

    /// Once the window has elapsed, retiring succeeds and the phase
    /// returns to `Active`.
    #[tokio::test]
    async fn retire_old_succeeds_after_window() {
        let machine = RotationStateMachine::new(MockSigner::generate("k1").unwrap());
        let successor = MockSigner::generate("k2").unwrap();

        // Negative windows are rejected, so open a 1 ms window and sleep
        // past it rather than trying to back-date the event.
        machine.begin_handover(&successor, Duration::milliseconds(1)).await.unwrap();
        tokio::time::sleep(std::time::Duration::from_millis(5)).await;

        machine.retire_old().await.expect("retire_old after window closes");
        assert_eq!(machine.phase().await, RotationPhase::Active);
    }

    /// Serializing the same payload twice must give identical bytes —
    /// the verifier rebuilds the signed bytes from the payload, so any
    /// non-determinism would silently invalidate signatures.
    #[tokio::test]
    async fn rotation_event_jcs_is_deterministic() {
        let issued_at = Utc::now();
        let transition_until = Utc::now() + Duration::days(30);
        let payload = RotationEventPayload {
            version: 1,
            from_algorithm: SigningAlgorithm::EcdsaSha256P256,
            from_key_id: "old".into(),
            from_public_key_b64: "AAAA".into(),
            to_algorithm: SigningAlgorithm::EcdsaSha256P256,
            to_key_id: "new".into(),
            to_public_key_b64: "BBBB".into(),
            issued_at,
            transition_until,
        };

        let first = RotationEvent::signed_bytes(&payload).unwrap();
        let second = RotationEvent::signed_bytes(&payload).unwrap();
        assert_eq!(first, second);
        // The signed bytes always begin with the domain prefix.
        assert!(first.starts_with(ROTATION_DOMAIN_PREFIX));
    }
}