Skip to main content

chio_federation/
trust_establishment.rs

1//! Cross-kernel trust establishment via mTLS-style handshake.
2//!
3//! Two kernels bootstrap mutual trust by exchanging signed challenges and
4//! pinning each other's kernel signing public keys. Once pinned, the
5//! [`FederationPeer`] set lives alongside the existing federation-state
6//! primitives (`chio-federation` already persists activation, governance, and
7//! reputation artifacts; the peer set is a new surface with the same
8//! persistence semantics: in-memory by default, pluggable store).
9//!
10//! ## Handshake summary
11//!
12//! 1. Each side builds a [`HandshakeChallenge`] binding `(local_kernel_id,
13//!    remote_kernel_id, nonce, timestamp)` and signs it with its own
14//!    kernel key.
15//! 2. Peers exchange their [`PeerHandshakeEnvelope`] (challenge + signature
16//!    + declared public key).
17//! 3. Each side verifies the remote envelope's signature against the
18//!    declared public key, checks freshness (`nonce`, `timestamp` skew),
19//!    verifies that the key matches either a pre-configured trust anchor
20//!    or an already-pinned peer, and then pins the remote public key as a
21//!    [`FederationPeer`] with a rotation deadline derived from the
22//!    configured freshness window.
23//!
24//! ## Freshness rotation
25//!
26//! A [`FederationPeer`] carries a `rotation_due` timestamp. After that
27//! timestamp the peer is considered stale and is refused fail-closed by
28//! [`KernelTrustExchange::resolve`]; the two kernels must re-run the
29//! handshake to re-pin the key. Rotation never silently renews: the
30//! caller must explicitly issue a new handshake.
31
32use std::collections::HashMap;
33use std::sync::{Mutex, PoisonError};
34
35use chio_core_types::canonical::canonical_json_bytes;
36use chio_core_types::crypto::{Ed25519Backend, Keypair, PublicKey, Signature, SigningBackend};
37use serde::{Deserialize, Serialize};
38
/// Schema identifier written into every [`HandshakeChallenge`] built by
/// [`HandshakeChallenge::new`]. Envelopes whose challenge carries any other
/// value are rejected by [`PeerHandshakeEnvelope::verify_signature`].
pub const FEDERATION_HANDSHAKE_SCHEMA: &str = "chio.federation-kernel-handshake.v1";

/// Freshness window, in seconds, given to a newly pinned peer unless the
/// caller supplies a different [`KernelTrustExchangeConfig`]. Twelve hours
/// is a compromise: long enough that operators are not paged at 3am to
/// re-handshake, short enough to keep the bounded-trust guarantee the
/// federation layer promises.
pub const DEFAULT_ROTATION_WINDOW_SECS: u64 = 12 * 3_600;

/// Largest tolerated difference, in seconds, between the timestamp inside
/// a handshake envelope and the receiving kernel's clock. Envelopes that
/// are older than this, or further in the future than this, are rejected.
pub const DEFAULT_HANDSHAKE_MAX_SKEW_SECS: u64 = 60 * 5;
51
/// Pinned federation peer entry.
///
/// Records which public key is currently pinned for a remote kernel and
/// for how long that pin may be relied upon. Serialized with camelCase
/// field names; unknown fields are rejected on deserialization.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct FederationPeer {
    /// Identifier of the remote kernel this pin belongs to.
    pub kernel_id: String,
    /// Public key pinned for that remote kernel.
    pub public_key: PublicKey,
    /// Unix seconds at which the peer was last pinned via a successful
    /// handshake.
    pub established_at: u64,
    /// Unix seconds at which the pin expires. After this timestamp the
    /// peer is treated as stale and MUST be re-handshaked before any
    /// federation-level operation is accepted against it.
    pub rotation_due: u64,
}
66
67impl FederationPeer {
68    /// Returns `true` when the peer's pin is still within its freshness
69    /// window relative to `now`.
70    pub fn is_fresh(&self, now: u64) -> bool {
71        now < self.rotation_due
72    }
73}
74
/// Challenge body signed by one kernel during the handshake.
///
/// The canonical JSON encoding of this struct is the exact byte string
/// that gets signed and verified. Serialized with camelCase field names;
/// unknown fields are rejected on deserialization.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct HandshakeChallenge {
    /// Schema identifier; [`HandshakeChallenge::new`] sets it to
    /// [`FEDERATION_HANDSHAKE_SCHEMA`].
    pub schema: String,
    /// Identifier of the kernel that builds and signs the challenge (the
    /// sender, from the receiver's point of view).
    pub local_kernel_id: String,
    /// Identifier of the kernel the challenge is addressed to.
    pub remote_kernel_id: String,
    /// Caller-supplied nonce; it is part of the signed bytes.
    pub nonce: String,
    /// Sender's clock reading when the challenge was built; the receiver
    /// compares it against its own clock within the configured skew.
    pub timestamp: u64,
}
85
86impl HandshakeChallenge {
87    pub fn new(
88        local_kernel_id: impl Into<String>,
89        remote_kernel_id: impl Into<String>,
90        nonce: impl Into<String>,
91        timestamp: u64,
92    ) -> Self {
93        Self {
94            schema: FEDERATION_HANDSHAKE_SCHEMA.to_string(),
95            local_kernel_id: local_kernel_id.into(),
96            remote_kernel_id: remote_kernel_id.into(),
97            nonce: nonce.into(),
98            timestamp,
99        }
100    }
101
102    pub fn canonical_bytes(&self) -> Result<Vec<u8>, PeerHandshakeError> {
103        canonical_json_bytes(self).map_err(|e| PeerHandshakeError::CanonicalJson(e.to_string()))
104    }
105}
106
/// Envelope one kernel sends to the other during a handshake.
///
/// Bundles the challenge with the sender's declared public key and the
/// signature over the challenge's canonical bytes. Serialized with
/// camelCase field names; unknown fields are rejected on deserialization.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct PeerHandshakeEnvelope {
    /// The signed challenge body.
    pub challenge: HandshakeChallenge,
    /// Public key the sender claims as its own; the signature is checked
    /// against this key.
    pub declared_public_key: PublicKey,
    /// Signature over the canonical bytes of `challenge`.
    pub signature: Signature,
}
115
116impl PeerHandshakeEnvelope {
117    /// Build a signed handshake envelope from `local` addressed to `remote`.
118    pub fn sign(
119        local_kernel_id: &str,
120        remote_kernel_id: &str,
121        nonce: &str,
122        timestamp: u64,
123        local_keypair: &Keypair,
124    ) -> Result<Self, PeerHandshakeError> {
125        let challenge =
126            HandshakeChallenge::new(local_kernel_id, remote_kernel_id, nonce, timestamp);
127        let bytes = challenge.canonical_bytes()?;
128        let backend = Ed25519Backend::new(local_keypair.clone());
129        let signature = backend
130            .sign_bytes(&bytes)
131            .map_err(|e| PeerHandshakeError::SigningFailed(e.to_string()))?;
132        Ok(Self {
133            challenge,
134            declared_public_key: local_keypair.public_key(),
135            signature,
136        })
137    }
138
139    /// Verify this envelope in isolation (signature valid for declared
140    /// public key; schema is the expected version). Callers still need to
141    /// confirm the envelope targets them and fits within the freshness
142    /// window: [`KernelTrustExchange::accept_envelope`] is the convenient
143    /// one-shot version that enforces all of that.
144    pub fn verify_signature(&self) -> Result<(), PeerHandshakeError> {
145        if self.challenge.schema != FEDERATION_HANDSHAKE_SCHEMA {
146            return Err(PeerHandshakeError::UnsupportedSchema(
147                self.challenge.schema.clone(),
148            ));
149        }
150        let bytes = self.challenge.canonical_bytes()?;
151        if !self.declared_public_key.verify(&bytes, &self.signature) {
152            return Err(PeerHandshakeError::InvalidSignature);
153        }
154        Ok(())
155    }
156}
157
/// Errors raised by the trust-establishment primitives. Every variant is
/// fail-closed: callers MUST refuse to pin a peer when any step fails.
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum PeerHandshakeError {
    /// The challenge's `schema` field is not
    /// [`FEDERATION_HANDSHAKE_SCHEMA`]; carries the offending value.
    #[error("unsupported handshake schema: {0}")]
    UnsupportedSchema(String),

    /// Canonical JSON encoding of a challenge failed; carries the
    /// encoder's message.
    #[error("canonical JSON encoding failed: {0}")]
    CanonicalJson(String),

    /// The signing backend failed while producing an envelope; carries the
    /// backend's message.
    #[error("handshake signing failed: {0}")]
    SigningFailed(String),

    /// The envelope's signature does not verify against its declared
    /// public key.
    #[error("remote handshake signature is invalid")]
    InvalidSignature,

    /// The envelope is addressed to a kernel other than the local one.
    #[error("remote envelope is addressed to kernel_id {addressed_to} but we are {actual}")]
    AddressMismatch {
        addressed_to: String,
        actual: String,
    },

    /// The envelope's sender identifier differs from the remote kernel the
    /// caller expected.
    #[error("remote envelope declares self as kernel_id {declared} but we expected {expected}")]
    KernelIdMismatch { declared: String, expected: String },

    /// The envelope timestamp differs from the local clock by more than
    /// the configured skew (in either direction).
    #[error("remote envelope timestamp {envelope} drifts from local clock {local} beyond {skew}s")]
    ClockSkewExceeded {
        envelope: u64,
        local: u64,
        skew: u64,
    },

    /// No pin exists in the store for the requested kernel.
    #[error("peer {0} is not pinned; run a handshake before resolving")]
    PeerNotPinned(String),

    /// A pin exists but its `rotation_due` deadline has been reached.
    #[error("peer {0} is stale and must be re-handshaked before use")]
    PeerStale(String),

    /// Neither a configured trust anchor nor an existing pin is available
    /// for the remote kernel, so its declared key cannot be checked.
    #[error("peer {0} is not trusted for first contact; configure a trust anchor before accepting handshakes")]
    MissingTrustAnchor(String),

    /// The declared public key differs from the trust anchor or existing
    /// pin; both keys are reported in hex.
    #[error("peer {kernel_id} declared unexpected public key; expected {expected}, got {actual}")]
    UnexpectedPeerKey {
        kernel_id: String,
        expected: String,
        actual: String,
    },

    /// A lock guarding the peer store was poisoned (produced by the
    /// `From<PoisonError<T>>` conversion).
    #[error("trust store is poisoned and cannot service requests")]
    StorePoisoned,
}
209
210impl<T> From<PoisonError<T>> for PeerHandshakeError {
211    fn from(_: PoisonError<T>) -> Self {
212        PeerHandshakeError::StorePoisoned
213    }
214}
215
/// Storage abstraction for pinned peers used by [`KernelTrustExchange`].
/// A runtime embedding can replace the default [`InMemoryPeerStore`] with
/// a persistent backing store by dropping a new impl of this trait in
/// place; this crate keeps the default lightweight for test-plane and
/// single-host deployments.
///
/// All methods take `&self`, and implementations must be `Send + Sync`.
pub trait FederationPeerStore: Send + Sync {
    /// Stores `peer`. The in-memory implementation keys entries by
    /// `peer.kernel_id` and replaces any existing entry for that id.
    fn insert(&self, peer: FederationPeer) -> Result<(), PeerHandshakeError>;
    /// Returns the entry stored for `kernel_id`, or `None` if there is none.
    fn get(&self, kernel_id: &str) -> Result<Option<FederationPeer>, PeerHandshakeError>;
    /// Deletes the entry for `kernel_id`, returning it if one existed.
    fn remove(&self, kernel_id: &str) -> Result<Option<FederationPeer>, PeerHandshakeError>;
    /// Returns a copy of every stored entry.
    fn snapshot(&self) -> Result<Vec<FederationPeer>, PeerHandshakeError>;
}
226
227/// Default in-memory peer store.
228#[derive(Debug, Default)]
229pub struct InMemoryPeerStore {
230    inner: Mutex<HashMap<String, FederationPeer>>,
231}
232
233impl InMemoryPeerStore {
234    pub fn new() -> Self {
235        Self::default()
236    }
237}
238
239impl FederationPeerStore for InMemoryPeerStore {
240    fn insert(&self, peer: FederationPeer) -> Result<(), PeerHandshakeError> {
241        let mut guard = self.inner.lock()?;
242        guard.insert(peer.kernel_id.clone(), peer);
243        Ok(())
244    }
245
246    fn get(&self, kernel_id: &str) -> Result<Option<FederationPeer>, PeerHandshakeError> {
247        let guard = self.inner.lock()?;
248        Ok(guard.get(kernel_id).cloned())
249    }
250
251    fn remove(&self, kernel_id: &str) -> Result<Option<FederationPeer>, PeerHandshakeError> {
252        let mut guard = self.inner.lock()?;
253        Ok(guard.remove(kernel_id))
254    }
255
256    fn snapshot(&self) -> Result<Vec<FederationPeer>, PeerHandshakeError> {
257        let guard = self.inner.lock()?;
258        Ok(guard.values().cloned().collect())
259    }
260}
261
262/// Configuration knobs for a [`KernelTrustExchange`]. Defaults match the
263/// module-level constants.
264#[derive(Debug, Clone, Copy)]
265pub struct KernelTrustExchangeConfig {
266    pub rotation_window_secs: u64,
267    pub max_handshake_skew_secs: u64,
268}
269
270impl Default for KernelTrustExchangeConfig {
271    fn default() -> Self {
272        Self {
273            rotation_window_secs: DEFAULT_ROTATION_WINDOW_SECS,
274            max_handshake_skew_secs: DEFAULT_HANDSHAKE_MAX_SKEW_SECS,
275        }
276    }
277}
278
/// Primitive that drives the mTLS-style key exchange between two kernels.
///
/// One [`KernelTrustExchange`] lives per local kernel. It owns the local
/// kernel's identity + signing keypair, its configuration, a peer store,
/// and the set of pre-configured trust anchors. It holds no clock: every
/// time-dependent method takes the current time as a `now` argument.
/// Callers use [`KernelTrustExchange::local_envelope`] to build a
/// challenge envelope to send to the remote, and
/// [`KernelTrustExchange::accept_envelope`] to verify an incoming envelope
/// and pin the remote peer.
pub struct KernelTrustExchange {
    /// Identifier of the local kernel; incoming envelopes must be
    /// addressed to it.
    local_kernel_id: String,
    /// Keypair used to sign outgoing envelopes.
    local_keypair: Keypair,
    /// Rotation window and clock-skew settings.
    config: KernelTrustExchangeConfig,
    /// Backing store for pinned peers.
    store: Box<dyn FederationPeerStore>,
    /// Pre-configured trust anchors: remote kernel id -> expected public
    /// key. Consulted before any existing pin when accepting an envelope.
    trusted_peers: HashMap<String, PublicKey>,
}
294
295impl core::fmt::Debug for KernelTrustExchange {
296    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
297        f.debug_struct("KernelTrustExchange")
298            .field("local_kernel_id", &self.local_kernel_id)
299            .field("config", &self.config)
300            .finish_non_exhaustive()
301    }
302}
303
304impl KernelTrustExchange {
305    pub fn new(local_kernel_id: impl Into<String>, local_keypair: Keypair) -> Self {
306        Self {
307            local_kernel_id: local_kernel_id.into(),
308            local_keypair,
309            config: KernelTrustExchangeConfig::default(),
310            store: Box::new(InMemoryPeerStore::new()),
311            trusted_peers: HashMap::new(),
312        }
313    }
314
315    pub fn with_config(mut self, config: KernelTrustExchangeConfig) -> Self {
316        self.config = config;
317        self
318    }
319
320    pub fn with_store(mut self, store: Box<dyn FederationPeerStore>) -> Self {
321        self.store = store;
322        self
323    }
324
325    pub fn with_trusted_peer(
326        mut self,
327        kernel_id: impl Into<String>,
328        public_key: PublicKey,
329    ) -> Self {
330        self.trusted_peers.insert(kernel_id.into(), public_key);
331        self
332    }
333
334    pub fn local_kernel_id(&self) -> &str {
335        &self.local_kernel_id
336    }
337
338    pub fn local_public_key(&self) -> PublicKey {
339        self.local_keypair.public_key()
340    }
341
342    pub fn rotation_window_secs(&self) -> u64 {
343        self.config.rotation_window_secs
344    }
345
346    /// Build the local kernel's signed envelope addressed to `remote_kernel_id`.
347    pub fn local_envelope(
348        &self,
349        remote_kernel_id: &str,
350        nonce: &str,
351        now: u64,
352    ) -> Result<PeerHandshakeEnvelope, PeerHandshakeError> {
353        PeerHandshakeEnvelope::sign(
354            &self.local_kernel_id,
355            remote_kernel_id,
356            nonce,
357            now,
358            &self.local_keypair,
359        )
360    }
361
362    /// Accept an envelope received from `expected_remote_kernel_id` at
363    /// local clock `now`. Verifies the signature, the addressee, the
364    /// claimed remote kernel ID, the clock skew, and the expected remote
365    /// public key; on success, pins the remote public key as a fresh
366    /// [`FederationPeer`].
367    pub fn accept_envelope(
368        &self,
369        envelope: &PeerHandshakeEnvelope,
370        expected_remote_kernel_id: &str,
371        now: u64,
372    ) -> Result<FederationPeer, PeerHandshakeError> {
373        envelope.verify_signature()?;
374
375        if envelope.challenge.remote_kernel_id != self.local_kernel_id {
376            return Err(PeerHandshakeError::AddressMismatch {
377                addressed_to: envelope.challenge.remote_kernel_id.clone(),
378                actual: self.local_kernel_id.clone(),
379            });
380        }
381        if envelope.challenge.local_kernel_id != expected_remote_kernel_id {
382            return Err(PeerHandshakeError::KernelIdMismatch {
383                declared: envelope.challenge.local_kernel_id.clone(),
384                expected: expected_remote_kernel_id.to_string(),
385            });
386        }
387
388        let envelope_ts = envelope.challenge.timestamp;
389        let skew = self.config.max_handshake_skew_secs;
390        let drift = envelope_ts.abs_diff(now);
391        if drift > skew {
392            return Err(PeerHandshakeError::ClockSkewExceeded {
393                envelope: envelope_ts,
394                local: now,
395                skew,
396            });
397        }
398
399        let pinned_peer = self.store.get(expected_remote_kernel_id)?;
400        let expected_public_key = self
401            .trusted_peers
402            .get(expected_remote_kernel_id)
403            .cloned()
404            .or_else(|| pinned_peer.as_ref().map(|peer| peer.public_key.clone()))
405            .ok_or_else(|| {
406                PeerHandshakeError::MissingTrustAnchor(expected_remote_kernel_id.to_string())
407            })?;
408        if envelope.declared_public_key != expected_public_key {
409            return Err(PeerHandshakeError::UnexpectedPeerKey {
410                kernel_id: expected_remote_kernel_id.to_string(),
411                expected: expected_public_key.to_hex(),
412                actual: envelope.declared_public_key.to_hex(),
413            });
414        }
415
416        let peer = FederationPeer {
417            kernel_id: expected_remote_kernel_id.to_string(),
418            public_key: envelope.declared_public_key.clone(),
419            established_at: now,
420            rotation_due: now.saturating_add(self.config.rotation_window_secs),
421        };
422        self.store.insert(peer.clone())?;
423        Ok(peer)
424    }
425
426    /// Resolve a pinned peer, refusing stale pins fail-closed.
427    pub fn resolve(&self, kernel_id: &str, now: u64) -> Result<FederationPeer, PeerHandshakeError> {
428        let Some(peer) = self.store.get(kernel_id)? else {
429            return Err(PeerHandshakeError::PeerNotPinned(kernel_id.to_string()));
430        };
431        if !peer.is_fresh(now) {
432            return Err(PeerHandshakeError::PeerStale(kernel_id.to_string()));
433        }
434        Ok(peer)
435    }
436
437    /// Remove a pinned peer without waiting for rotation.
438    pub fn forget(&self, kernel_id: &str) -> Result<Option<FederationPeer>, PeerHandshakeError> {
439        self.store.remove(kernel_id)
440    }
441
442    /// Snapshot of all currently-pinned peers. Order is unspecified.
443    pub fn peers(&self) -> Result<Vec<FederationPeer>, PeerHandshakeError> {
444        self.store.snapshot()
445    }
446}