Skip to main content

koi_certmesh/
lib.rs

1//! Koi Certmesh - certificate mesh with pluggable enrollment auth (Phase 2+).
2//!
3//! Provides a private Certificate Authority that mints ECDSA P-256 certificates,
4//! pluggable enrollment authentication (TOTP), trust store installation,
5//! and a roster of enrolled members. Two machines on the same LAN can establish
6//! mutual TLS trust without external infrastructure.
7
8pub mod acme;
9pub mod audit;
10pub mod backup;
11pub mod bundle;
12pub mod ca;
13pub mod certfiles;
14pub mod certmesh_paths;
15pub mod client;
16#[cfg(test)]
17mod conformance;
18pub mod csr;
19pub mod diagnosis;
20pub mod enrollment;
21pub mod entropy;
22pub mod envelope;
23pub mod error;
24pub mod failover;
25pub mod health;
26pub mod http;
27pub mod init_ceremony;
28pub mod invite;
29pub mod lifecycle;
30pub mod member;
31pub mod mtls;
32pub mod profiles;
33pub mod protocol;
34pub mod roster;
35pub mod sealed;
36pub mod serve;
37pub mod wordlist;
38
39pub use certmesh_paths::CertmeshPaths;
40
41use std::sync::Arc;
42
43use axum::Router;
44use koi_common::capability::{Capability, CapabilityStatus};
45use koi_common::posture::Posture;
46use koi_crypto::auth::AuthState;
47use koi_crypto::totp::RateLimiter;
48use tokio::sync::{broadcast, mpsc, oneshot, watch};
49use zeroize::Zeroizing;
50
51pub use client::PeerClient;
52pub use csr::sign_csr;
53pub use error::CertmeshError;
54use roster::Roster;
55
/// mDNS service type for CA discovery (`_service._proto` form).
///
/// Used by the binary crate to announce the CA via koi-mdns.
pub const CERTMESH_SERVICE_TYPE: &str = "_certmesh._tcp";
59
/// Events emitted by the certmesh subsystem.
///
/// Sent on the broadcast channel held in `CertmeshState::event_tx`.
#[derive(Debug, Clone)]
pub enum CertmeshEvent {
    /// A new member was enrolled in the mesh.
    MemberJoined {
        /// Hostname of the newly enrolled member.
        hostname: String,
        /// Fingerprint reported with the enrollment (presumably of the member's
        /// certificate — TODO confirm against the emitting code).
        fingerprint: String,
    },
    /// A member was revoked from the mesh.
    MemberRevoked { hostname: String },
    /// All certmesh state was destroyed.
    Destroyed,
    /// This node's leaf certificate was renewed successfully (ADR-020 reactive plane).
    CertRenewed {
        /// When the new leaf expires (UTC instant, not a formatted string).
        expires_at: chrono::DateTime<chrono::Utc>,
    },
    /// The leaf will expire soon; renewal is overdue. Fires each time the renewal
    /// loop skips (CA unreachable) while the leaf is past its `renew_threshold`.
    CertExpiringSoon {
        /// Whole days until expiry (may be 0 or negative if already expired).
        days_left: i64,
    },
    /// A renewal attempt failed. `consecutive_failures` lets a consumer decide
    /// when to alert vs. absorb a transient CA hiccup.
    CertRenewalFailed {
        /// Human-readable reason from the renewal outcome.
        reason: String,
        /// How many consecutive failures (including this one).
        consecutive_failures: u32,
    },
    /// A trust-bundle pull updated the roster or policy, or confirmed revocation.
    BundleUpdated {
        /// `true` when the bundle explicitly listed this node as revoked — the node
        /// should stop serving and surface a clear error (ADR-020 §revocation).
        self_revoked: bool,
    },
}
98
99// ── Internal shared state ───────────────────────────────────────────
100
/// Internal shared state for CertmeshCore and HTTP handlers.
/// Not exposed outside this crate - all access goes through CertmeshCore methods.
pub(crate) struct CertmeshState {
    /// Resolved filesystem paths (immutable after construction).
    pub(crate) paths: CertmeshPaths,
    /// In-memory CA state. `None` is what status reporting treats as
    /// "CA locked", and what `destroy` resets it to.
    pub(crate) ca: tokio::sync::Mutex<Option<ca::CaState>>,
    /// The member roster. Persisted changes go through `commit_roster` /
    /// `touch_roster`, which hold this lock across the write.
    pub(crate) roster: tokio::sync::Mutex<Roster>,
    /// Enrollment auth state; cleared by `destroy`.
    pub(crate) auth: tokio::sync::Mutex<Option<AuthState>>,
    /// An outstanding auth challenge, if any; cleared by `destroy`.
    pub(crate) pending_challenge: tokio::sync::Mutex<Option<koi_crypto::auth::AuthChallenge>>,
    /// TOTP rate limiter (see `load_rate_limiter` / `persist_rate_limiter`).
    pub(crate) rate_limiter: tokio::sync::Mutex<RateLimiter>,
    /// Channel to the operator approval prompt. While `None`,
    /// `request_approval` fails with `CertmeshError::ApprovalUnavailable`.
    pub(crate) approval_tx: tokio::sync::Mutex<Option<mpsc::Sender<ApprovalRequest>>>,
    /// Broadcast sender for [`CertmeshEvent`]s.
    pub(crate) event_tx: broadcast::Sender<CertmeshEvent>,
    /// Latest node posture, published on every identity-mutating op so a listener
    /// supervisor (ADR-020 §5) can react to Open↔Authenticated transitions without
    /// polling. Seeded from disk at construction; coalesced (no-op when unchanged).
    pub(crate) posture_tx: watch::Sender<Posture>,
    /// Tracks consecutive renewal failures so `CertRenewalFailed` can report the
    /// streak to consumers. Reset to zero on each successful renewal.
    pub(crate) renewal_failure_count: std::sync::atomic::AtomicU32,
}
121
/// Enrollment approval request sent to the operator prompt.
///
/// Built by `request_approval` and sent over `CertmeshState::approval_tx`; the
/// prompt answers through `respond_to`.
#[derive(Debug)]
pub struct ApprovalRequest {
    /// Hostname of the node asking to enroll.
    pub hostname: String,
    /// Whether this mesh requires operator approval (carries the operator name
    /// requirement that the old `profile` flag used to encode).
    pub requires_approval: bool,
    /// One-shot reply channel for the operator's [`ApprovalDecision`]. Dropping
    /// it without replying surfaces as `CertmeshError::ApprovalUnavailable`.
    pub respond_to: oneshot::Sender<ApprovalDecision>,
}
131
/// Enrollment approval decision from the operator prompt.
#[derive(Debug)]
pub enum ApprovalDecision {
    /// The enrollment was approved. `operator` names the approver; when the
    /// request had `requires_approval` set, a missing or empty name is treated
    /// as a denial by `request_approval`.
    Approved { operator: Option<String> },
    /// The enrollment was refused.
    Denied,
}
138
/// How long `request_approval` waits for the operator's decision, in seconds,
/// before giving up with `CertmeshError::ApprovalTimeout`.
const APPROVAL_TIMEOUT_SECS: u64 = 300;

/// Hard ceiling on a single member-pull renewal request (connect + handshake +
/// request + body). Bounds a black-holed CA so the renewal loop and daemon
/// shutdown never wait on the OS TCP timeout.
const RENEWAL_REQUEST_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
145
/// Result of daemon self-enrollment for the mTLS listener.
///
/// Contains all PEM material needed to configure TLS with client cert
/// verification. Cloneable so the same leaf can configure both the mTLS and the
/// ACME server-auth listeners.
///
/// Only `Clone` is derived — there is no `Debug` impl, so the private key cannot
/// be printed through `{:?}` by accident.
#[derive(Clone)]
pub struct SelfEnrollment {
    /// The daemon's certificate (signed by the CA).
    pub cert_pem: String,
    /// The daemon's private key. Secret material.
    pub key_pem: String,
    /// The CA certificate (for client verification).
    pub ca_cert_pem: String,
}
160
/// This node's live cryptographic identity (ADR-020 §7): its CA-signed leaf plus
/// the CA anchor it chains to. The unified replacement for the previously
/// fragmented [`SelfEnrollment`] (cert/key/CA, no hostname) and
/// [`member::MemberState`] (CA coordinates, no cert). Returned by
/// [`CertmeshCore::local_identity`] and `ensure_identity`.
///
/// Cloneable so the same leaf can configure multiple listeners/clients. `Debug`
/// is redacted — the private key is never logged. For a serializable view that
/// carries no PEM material at all, convert to [`IdentityInfo`].
#[derive(Clone)]
pub struct Identity {
    /// This node's hostname (its certificate CN / cert directory name).
    pub hostname: String,
    /// The node's leaf certificate (PEM), signed by the CA.
    pub cert_pem: String,
    /// The node's private key (PEM). Never logged (redacted `Debug`).
    pub key_pem: String,
    /// The CA root certificate (PEM) the leaf chains to.
    pub ca_cert_pem: String,
    /// SHA-256 (hex) of the CA cert DER — the pin peers verify against.
    pub ca_fingerprint: String,
    /// Renewal/expiry health of the leaf (ADR-020 §13: "loud, not silent").
    pub renewal: RenewalHealth,
}
184
185impl std::fmt::Debug for Identity {
186    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
187        f.debug_struct("Identity")
188            .field("hostname", &self.hostname)
189            .field("ca_fingerprint", &self.ca_fingerprint)
190            .field("renewal", &self.renewal)
191            .field("cert_pem", &"<redacted>")
192            .field("key_pem", &"<redacted>")
193            .field("ca_cert_pem", &"<redacted>")
194            .finish()
195    }
196}
197
/// Derived renewal/expiry health of a leaf certificate (ADR-020 §13).
///
/// The schedule facts a node and operator need so identity expiry is never a
/// silent surprise: when the leaf expires, when renewal is due, and whether it is
/// overdue or already expired. Attempt-level fields (last attempt, failure streak)
/// are wired by the renewal loop in a later increment.
///
/// The relative fields (`expires_in_days`, `renew_overdue`, `expired`) are a
/// snapshot against the clock at the moment the value was derived; they do not
/// update afterwards.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize, utoipa::ToSchema)]
pub struct RenewalHealth {
    /// When the current leaf expires.
    pub expires_at: chrono::DateTime<chrono::Utc>,
    /// When renewal becomes due (`expires_at` − `renew_threshold_days`).
    pub next_renewal_at: chrono::DateTime<chrono::Utc>,
    /// Whole days until expiry (negative once expired).
    pub expires_in_days: i64,
    /// At/past the renewal point but the leaf has not yet rotated.
    pub renew_overdue: bool,
    /// At/past expiry — renewal failed or never ran.
    pub expired: bool,
}
217
218impl RenewalHealth {
219    /// Derive health from a leaf cert PEM and the CA-held policy. `None` when the
220    /// certificate's validity window cannot be parsed.
221    fn from_leaf(cert_pem: &str, policy: &roster::CertPolicy) -> Option<Self> {
222        let expires_at = leaf_not_after_utc(cert_pem)?;
223        let next_renewal_at =
224            expires_at - chrono::Duration::days(i64::from(policy.renew_threshold_days));
225        let now = chrono::Utc::now();
226        Some(Self {
227            expires_at,
228            next_renewal_at,
229            expires_in_days: (expires_at - now).num_days(),
230            renew_overdue: now >= next_renewal_at,
231            expired: now >= expires_at,
232        })
233    }
234}
235
/// Serializable, key-redacting projection of [`Identity`] for cross-process and
/// cross-language consumers (ADR-020 reactive plane / wishlist 5.3).
///
/// The private key and all raw PEM material are omitted — only the non-sensitive
/// scheduling and anchor facts that a consumer needs to surface "who is this node
/// and when does its identity expire?" without leaking key material.
///
/// Obtain one with `IdentityInfo::from(&identity)`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::ToSchema)]
pub struct IdentityInfo {
    /// This node's hostname (its certificate CN).
    pub hostname: String,
    /// SHA-256 (hex) of the CA cert DER — the mesh anchor the peer pins to.
    pub ca_fingerprint: String,
    /// Renewal and expiry schedule.
    pub renewal: RenewalHealth,
}
251
252impl From<&Identity> for IdentityInfo {
253    fn from(id: &Identity) -> Self {
254        Self {
255            hostname: id.hostname.clone(),
256            ca_fingerprint: id.ca_fingerprint.clone(),
257            renewal: id.renewal.clone(),
258        }
259    }
260}
261
262/// The posture watch seeded from disk: a node is `signed` when it already holds a
263/// usable CA-anchored leaf. Used by every `CertmeshState` constructor so the watch
264/// reports the right value before any mutation (ADR-020 §5).
265fn initial_posture_tx(paths: &CertmeshPaths) -> watch::Sender<Posture> {
266    watch::channel(Posture {
267        signed: node_has_identity(paths),
268        encrypted: false,
269    })
270    .0
271}
272
273impl CertmeshState {
274    /// Recompute this node's posture from disk and publish it on the watch
275    /// (ADR-020 §5). Coalesced — a `send` (and thus a `PostureChanged`) fires only
276    /// when the posture actually changed. Called after every identity-mutating op
277    /// (create / self-enroll / member install / destroy).
278    pub(crate) fn republish_posture(&self) {
279        let next = Posture {
280            signed: node_has_identity(&self.paths),
281            encrypted: false,
282        };
283        self.posture_tx.send_if_modified(|cur| {
284            if *cur != next {
285                *cur = next;
286                true
287            } else {
288                false
289            }
290        });
291    }
292
    /// Destroy all certmesh state - shared by CertmeshCore::destroy() and the HTTP handler.
    ///
    /// Order: reset the in-memory CA/auth/challenge/roster, delete the
    /// platform-sealed key material, remove the on-disk directories and audit
    /// log, then republish the posture.
    ///
    /// # Errors
    ///
    /// Returns `CertmeshError::Internal` only when the blocking cleanup task
    /// cannot be joined. Individual removal failures are logged at `warn` and do
    /// **not** fail the call, so `Ok(())` does not guarantee every file is gone.
    pub(crate) async fn destroy(&self) -> Result<(), CertmeshError> {
        // Clear in-memory state first
        *self.ca.lock().await = None;
        *self.auth.lock().await = None;
        *self.pending_challenge.lock().await = None;
        *self.roster.lock().await = Roster::empty();

        // Remove platform-sealed key material (best-effort)
        if let Err(e) = koi_crypto::tpm::delete_key_material("koi-certmesh-ca") {
            tracing::debug!(error = %e, "No platform-sealed key material to clean up");
        }

        // Filesystem cleanup via spawn_blocking to avoid blocking the async executor
        let certmesh_dir = self.paths.certmesh_dir();
        let certs_dir = self.paths.certs_dir();
        let audit_path = self.paths.audit_log_path();
        tokio::task::spawn_blocking(move || {
            // Each target is removed independently: one failure is logged and
            // the remaining targets are still attempted.
            if certmesh_dir.exists() {
                if let Err(e) = std::fs::remove_dir_all(&certmesh_dir) {
                    tracing::warn!(error = %e, "Failed to remove certmesh directory");
                } else {
                    tracing::info!(path = %certmesh_dir.display(), "Certmesh data directory removed");
                }
            }
            if certs_dir.exists() {
                if let Err(e) = std::fs::remove_dir_all(&certs_dir) {
                    tracing::warn!(error = %e, "Failed to remove certificate files");
                } else {
                    tracing::info!(path = %certs_dir.display(), "Certificate files removed");
                }
            }
            if audit_path.exists() {
                if let Err(e) = std::fs::remove_file(&audit_path) {
                    tracing::warn!(error = %e, "Failed to remove audit log");
                } else {
                    tracing::info!(path = %audit_path.display(), "Audit log removed");
                }
            }
        })
        .await
        .map_err(|e| CertmeshError::Internal(format!("destroy task: {e}")))?;

        tracing::info!("Certmesh state destroyed");
        // Posture is derived from disk, so recompute only after the cleanup ran.
        self.republish_posture();
        Ok(())
    }
340
    /// Single-writer commit of a **membership** change (ADR-017 F8).
    ///
    /// Holds the roster lock for the entire read-modify-write, bumps `seq`, and
    /// persists atomically *while still holding the lock* — so concurrent commits
    /// serialize in `seq` order and can never lose an update (the old
    /// `clone → drop → write` pattern could). Persists only when `mutate` returns
    /// `Ok`; the closure must not leave the roster mutated on `Err`.
    ///
    /// # Errors
    ///
    /// Returns the closure's error unchanged (nothing is saved in that case), or
    /// `CertmeshError::Io` when saving the roster fails.
    pub(crate) async fn commit_roster<F, R>(&self, mutate: F) -> Result<R, CertmeshError>
    where
        F: FnOnce(&mut Roster) -> Result<R, CertmeshError>,
    {
        self.commit_inner(true, mutate).await
    }
354
    /// Persist a **non-membership** change (e.g. `last_seen`) without bumping
    /// `seq`, still holding the lock across the atomic write. The trust bundle is
    /// unaffected (it does not carry liveness), so its `seq`/cache stay stable.
    ///
    /// # Errors
    ///
    /// Returns the closure's error unchanged (nothing is saved in that case), or
    /// `CertmeshError::Io` when saving the roster fails.
    pub(crate) async fn touch_roster<F, R>(&self, mutate: F) -> Result<R, CertmeshError>
    where
        F: FnOnce(&mut Roster) -> Result<R, CertmeshError>,
    {
        self.commit_inner(false, mutate).await
    }
364
    /// Shared implementation behind `commit_roster` and `touch_roster`.
    ///
    /// Takes the roster lock, runs `mutate`, optionally increments
    /// `metadata.seq` (saturating) when `bump_seq` is set, then saves a clone of
    /// the roster on a blocking thread while the lock is still held.
    ///
    /// # Errors
    ///
    /// - The error returned by `mutate`, propagated before anything is saved.
    /// - `CertmeshError::Io` when the save (or joining its blocking task) fails.
    ///   In that case the in-memory roster keeps the mutation; the failure is
    ///   recorded in the audit log on a best-effort basis.
    async fn commit_inner<F, R>(&self, bump_seq: bool, mutate: F) -> Result<R, CertmeshError>
    where
        F: FnOnce(&mut Roster) -> Result<R, CertmeshError>,
    {
        let mut roster = self.roster.lock().await;
        let out = mutate(&mut roster)?;
        if bump_seq {
            roster.metadata.seq = roster.metadata.seq.saturating_add(1);
        }
        // The blocking task needs owned data, so it gets a clone; the guard
        // itself stays here, alive until the function returns.
        let snapshot = roster.clone();
        let path = self.paths.roster_path();
        // Persist off the executor but keep the roster lock held so writes
        // serialize in seq order (single writer).
        let saved = tokio::task::spawn_blocking(move || roster::save_roster(&snapshot, &path))
            .await
            .map_err(|e| std::io::Error::other(format!("roster save task: {e}")))
            .and_then(|r| r)
            .map_err(CertmeshError::Io);
        if let Err(e) = saved {
            // A failed persist is a trust-relevant event: the in-memory roster
            // advanced but the durable copy did not (ADR-017 F9). Audit before
            // returning so the gap is visible.
            // The audit write's own result is deliberately ignored.
            let _ = audit::append_entry_to(
                &self.paths.audit_log_path(),
                "roster_persist_failed",
                &[("error", &e.to_string())],
            );
            return Err(e);
        }
        Ok(out)
    }
396}
397
398// ── CertmeshCore - domain facade ────────────────────────────────────
399
/// CertmeshCore - the main domain facade.
///
/// Wraps the shared certmesh state and exposes commands,
/// status, and HTTP routes to the binary crate.
///
/// `Clone` is a cheap `Arc` bump — every clone shares the same underlying
/// `CertmeshState` (CA, roster, auth). This lets the composition layer hold a
/// facade while also building an `AcmeState` over the same state.
#[derive(Clone)]
pub struct CertmeshCore {
    /// The shared state; private so all access goes through facade methods.
    state: Arc<CertmeshState>,
}
412
413// impl CertmeshCore is split across cohesive submodules (certmesh M2).
414// Each child module does 'use super::*' to inherit lib.rs's imports, sibling
415// modules, and crate-private state + helpers.
416mod core_admin;
417mod core_auth;
418mod core_enroll;
419mod core_identity;
420mod core_lifecycle;
421mod core_member;
422mod core_renewal;
423mod core_setup;
424
/// Shell metacharacters forbidden in reload hook commands.
///
/// Single source of truth for hook-command validation (the HTTP handler
/// delegates to [`CertmeshCore::set_reload_hook`], which calls
/// [`validate_reload_hook`]).
///
/// NOTE(review): quotes (`'`, `"`), backslash, `#`, and whitespace are not in
/// this list (backslash is needed for Windows paths). Whether that is safe
/// depends on how the hook is executed, which is not visible here — confirm
/// against the code that spawns it.
const HOOK_FORBIDDEN: &[char] = &[
    ';', '|', '&', '$', '`', '>', '<', '(', ')', '\n', '\r', '\0', '*', '?', '[', ']', '{', '}',
    '~', '%', '!',
];
434
435/// Validate a post-renewal reload hook command.
436///
437/// This is the **single source of truth** for hook validation — every caller
438/// (HTTP, embedded, CLI) is protected because they all route through
439/// [`CertmeshCore::set_reload_hook`], which calls this. The validation is the
440/// superset of all prior checks:
441///
442/// 1. No shell metacharacters ([`HOOK_FORBIDDEN`]).
443/// 2. The command must be an **absolute path** — on Unix it must start with
444///    `/`; on Windows it must begin with a drive-letter path (`X:\…`) or UNC
445///    path (`\\…`). This blocks `PATH`-relative command injection.
446pub(crate) fn validate_reload_hook(hook: &str) -> Result<(), CertmeshError> {
447    if hook.contains(HOOK_FORBIDDEN) {
448        return Err(CertmeshError::InvalidPayload(
449            "reload hook contains forbidden characters".into(),
450        ));
451    }
452    #[cfg(unix)]
453    if !hook.starts_with('/') {
454        return Err(CertmeshError::InvalidPayload(
455            "reload hook must be an absolute path".into(),
456        ));
457    }
458    #[cfg(windows)]
459    {
460        let bytes = hook.as_bytes();
461        let drive_letter = bytes.len() >= 3 && bytes[1] == b':';
462        let unc = hook.starts_with("\\\\");
463        if !(drive_letter || unc) {
464            return Err(CertmeshError::InvalidPayload(
465                "reload hook must be an absolute path".into(),
466            ));
467        }
468    }
469    Ok(())
470}
471
/// Outcome of a member trust-bundle pull ([`CertmeshCore::pull_trust_bundle`]).
#[derive(Debug)]
pub enum BundleOutcome {
    /// This node has no member state — it never joined a mesh. Nothing to pull.
    NotApplicable,
    /// The bundle verified but its `seq` matches what we already have.
    NoChange { seq: u64 },
    /// A newer, verified bundle was accepted; policy + `last_bundle_seq` updated.
    ///
    /// `seq` is the accepted bundle's sequence number. `self_revoked`
    /// presumably mirrors `CertmeshEvent::BundleUpdated::self_revoked` (the
    /// bundle lists this node as revoked) — TODO confirm in `pull_trust_bundle`.
    Updated { seq: u64, self_revoked: bool },
}
482
/// Outcome of a member-pull renewal attempt ([`CertmeshCore::renew_self_if_due`]).
#[derive(Debug)]
pub enum RenewOutcome {
    /// This node has no member renewal state — it never joined a mesh (e.g. it is
    /// the CA, or unconfigured). Nothing to do.
    NotApplicable,
    /// The local leaf is not yet within the renewal threshold.
    NotDue {
        /// Expiry of the leaf that was left in place.
        not_after: chrono::DateTime<chrono::Utc>,
    },
    /// The leaf was renewed (key rotated); carries the new expiry and any reload
    /// hook result.
    Renewed {
        /// New expiry as a string. NOTE(review): the format is not visible in
        /// this file (unlike `NotDue::not_after`, this is not a typed datetime).
        expires: String,
        /// Result of the reload hook, when one ran.
        hook: Option<protocol::HookResult>,
    },
}
500
/// Whether a node rooted at `paths` holds a usable local identity: a CA-signed
/// leaf (`cert.pem`/`key.pem`) for the local hostname on disk, anchored to a mesh
/// (the CA is initialized here, or a `member.json` records the joined mesh — so an
/// orphaned leaf left by `destroy` does not read as secure). Backs
/// [`CertmeshCore::posture`] and the [`CertmeshCore::require_auth`] gate.
///
/// Returns `false` when the local hostname cannot be determined.
pub(crate) fn node_has_identity(paths: &CertmeshPaths) -> bool {
    let Some(hostname) = CertmeshCore::local_hostname() else {
        return false;
    };
    let leaf = paths.certs_dir().join(&hostname);
    let leaf_present = leaf.join("cert.pem").exists() && leaf.join("key.pem").exists();
    let anchored = paths.is_ca_initialized() || paths.member_state_path().exists();
    leaf_present && anchored
}

/// Parse a leaf certificate PEM and return its `not_after` as a UTC datetime.
///
/// Returns `None` on unparseable PEM/DER or an out-of-range timestamp.
fn leaf_not_after_utc(cert_pem: &str) -> Option<chrono::DateTime<chrono::Utc>> {
    use x509_parser::prelude::FromDer;
    let der = pem::parse(cert_pem).ok()?;
    let (_, cert) = x509_parser::certificate::X509Certificate::from_der(der.contents()).ok()?;
    // Sub-second precision is dropped: only whole seconds are carried over.
    chrono::DateTime::from_timestamp(cert.validity().not_after.timestamp(), 0)
}
525
/// Write `bytes` to `path` atomically (temp file → fsync → rename), 0600 on Unix
/// when `private` is set. Used by the member-pull renewal install so a crash
/// mid-write can never leave a half-written key or cert in place. The temp name
/// carries the pid so concurrent writers (different processes) never collide on
/// it; writers within one process targeting the same path are not protected.
///
/// Guarantees beyond the plain write-then-rename:
/// - A private temp file is created with mode 0600 (and re-restricted if a
///   stale temp file already existed) **before** any content is written, so
///   secret bytes are never on disk under wider, umask-derived permissions.
/// - The temp file is flushed to stable storage before the rename, so the
///   rename can never publish a file whose contents were still unwritten.
/// - On any failure the temp file is removed (best-effort) instead of leaked.
///
/// # Errors
///
/// Returns the first I/O error from creating, restricting, writing, syncing or
/// renaming the temp file. The destination is left untouched in that case.
fn write_file_atomic(path: &std::path::Path, bytes: &[u8], private: bool) -> std::io::Result<()> {
    use std::io::Write;

    let tmp = path.with_extension(format!("tmp.{}", std::process::id()));

    // Stage the content in the temp file; nothing here touches `path`.
    let stage = || -> std::io::Result<()> {
        let mut options = std::fs::OpenOptions::new();
        options.write(true).create(true).truncate(true);
        #[cfg(unix)]
        if private {
            use std::os::unix::fs::OpenOptionsExt;
            options.mode(0o600);
        }
        let mut file = options.open(&tmp)?;
        // `mode` only applies when the file is newly created; a leftover temp
        // file keeps its old permissions, so restrict explicitly as well.
        #[cfg(unix)]
        if private {
            use std::os::unix::fs::PermissionsExt;
            file.set_permissions(std::fs::Permissions::from_mode(0o600))?;
        }
        #[cfg(not(unix))]
        let _ = private;
        file.write_all(bytes)?;
        file.sync_all()
    };

    let outcome = stage().and_then(|()| std::fs::rename(&tmp, path));
    if outcome.is_err() {
        // Best-effort cleanup; the original error is what the caller needs.
        let _ = std::fs::remove_file(&tmp);
    }
    outcome
}
543
544/// Whether the recorded machine binding still matches this host (ADR-017 F11).
545///
546/// `true` when no binding was recorded (a pre-F11 CA — not machine-checked) or
547/// when the recorded fingerprint matches the current host. `false` only when a
548/// recorded binding no longer matches, or can't be re-derived — both of which
549/// must fail auto-unlock safe (boot locked).
550///
551/// Free function (not a method) so the daemon boot path
552/// (`koi_compose::init_certmesh_core`, which builds the core *after* deciding
553/// whether to auto-unlock) can gate on it with only the resolved paths. It does
554/// blocking I/O (a file read; a subprocess on Windows/macOS) — call it from a sync
555/// context or via `spawn_blocking`.
556pub fn machine_binding_ok(paths: &CertmeshPaths) -> bool {
557    let recorded = match std::fs::read_to_string(paths.machine_bind_path()) {
558        Ok(s) => s.trim().to_string(),
559        Err(_) => return true, // no binding recorded → not machine-checked
560    };
561    match koi_crypto::vault::machine_fingerprint() {
562        Some(current) => koi_crypto::pinning::fingerprints_match(&current, &recorded),
563        None => false, // recorded a binding but machine-id is now unreadable → fail safe
564    }
565}
566
567/// Write the machine-binding fingerprint atomically (0600 on Unix), creating the
568/// parent directory if needed (ADR-017 F11). The value is a non-secret hash.
569fn write_machine_binding(path: &std::path::Path, fingerprint: &str) -> std::io::Result<()> {
570    if let Some(parent) = path.parent() {
571        std::fs::create_dir_all(parent)?;
572    }
573    write_file_atomic(path, fingerprint.as_bytes(), true)
574}
575
576/// Load the persisted TOTP rate-limiter state, or a fresh one (ADR-017 F7).
577///
578/// A missing or unparseable file yields a fresh limiter; the live check still
579/// fails closed, and a real lockout is re-persisted on the next failed attempt.
580fn load_rate_limiter(paths: &CertmeshPaths) -> RateLimiter {
581    match std::fs::read(paths.rate_limiter_path()) {
582        Ok(bytes) => serde_json::from_slice(&bytes).unwrap_or_else(|e| {
583            tracing::warn!(error = %e, "Could not parse persisted rate-limiter; starting fresh");
584            RateLimiter::new()
585        }),
586        Err(_) => RateLimiter::new(),
587    }
588}
589
590/// Persist the TOTP rate-limiter state atomically (0600) so a daemon restart can't
591/// reset an active lockout (ADR-017 F7). Best-effort — callers log any error.
592/// `pub(crate)` so the http promote handler can persist after its own check.
593pub(crate) fn persist_rate_limiter(
594    paths: &CertmeshPaths,
595    limiter: &RateLimiter,
596) -> std::io::Result<()> {
597    let path = paths.rate_limiter_path();
598    if let Some(parent) = path.parent() {
599        std::fs::create_dir_all(parent)?;
600    }
601    let json = serde_json::to_vec(limiter).map_err(std::io::Error::other)?;
602    write_file_atomic(&path, &json, true)
603}
604
605/// The single source of truth for hostname validation (ADR-017 F15): full
606/// **RFC 1123**, used everywhere a hostname becomes a certificate SAN/CN or a
607/// directory name under `certs/`.
608///
609/// Rules: total length 1..=253; one or more dot-separated labels; each label
610/// 1..=63 chars of ASCII alphanumeric or hyphen, with no leading or trailing
611/// hyphen. This subsumes the old per-call-site denylists — path separators (`/`
612/// `\`), `..`, `:`, NUL, and spaces are all rejected by construction, so a
613/// validated hostname is safe both as a SAN and as a single-segment directory
614/// name (it can never escape the certs directory).
615pub(crate) fn validate_hostname(hostname: &str) -> Result<(), CertmeshError> {
616    let reject = |msg: String| Err(CertmeshError::InvalidPayload(msg));
617    if hostname.is_empty() || hostname.len() > 253 {
618        return reject(format!(
619            "hostname length must be 1..=253 characters: {hostname:?}"
620        ));
621    }
622    for label in hostname.split('.') {
623        if label.is_empty() || label.len() > 63 {
624            return reject(format!(
625                "hostname label length must be 1..=63 characters: {hostname:?}"
626            ));
627        }
628        if !label
629            .bytes()
630            .all(|b| b.is_ascii_alphanumeric() || b == b'-')
631        {
632            return reject(format!(
633                "hostname has invalid characters (RFC 1123 allows alphanumerics + hyphen): {hostname:?}"
634            ));
635        }
636        if label.starts_with('-') || label.ends_with('-') {
637            return reject(format!(
638                "hostname label must not start or end with a hyphen: {hostname:?}"
639            ));
640        }
641    }
642    Ok(())
643}
644
/// Decode a hex string into bytes.
///
/// Accepts upper- and lower-case digits. Returns `None` when the input has an
/// odd length or contains anything that is not an ASCII hex digit — including
/// sign characters and non-ASCII text. The empty string decodes to an empty
/// vector.
///
/// Works on raw bytes rather than `str` slices: slicing a `&str` by byte
/// offset panics when the offset falls inside a multi-byte character, and
/// `u8::from_str_radix` would accept a leading `+` (so `"+f"` would decode).
fn decode_hex(hex: &str) -> Option<Vec<u8>> {
    /// Value of a single ASCII hex digit, or `None` for any other byte.
    fn nibble(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let pairs = hex.as_bytes().chunks_exact(2);
    if !pairs.remainder().is_empty() {
        return None;
    }
    pairs
        .map(|pair| Some((nibble(pair[0])? << 4) | nibble(pair[1])?))
        .collect()
}
655
656async fn request_approval(
657    state: &CertmeshState,
658    hostname: &str,
659    requires_approval: bool,
660) -> Result<Option<String>, CertmeshError> {
661    let tx = state
662        .approval_tx
663        .lock()
664        .await
665        .clone()
666        .ok_or(CertmeshError::ApprovalUnavailable)?;
667
668    let (respond_to, response_rx) = oneshot::channel();
669    let request = ApprovalRequest {
670        hostname: hostname.to_string(),
671        requires_approval,
672        respond_to,
673    };
674
675    if tx.send(request).await.is_err() {
676        return Err(CertmeshError::ApprovalUnavailable);
677    }
678
679    let decision = match tokio::time::timeout(
680        std::time::Duration::from_secs(APPROVAL_TIMEOUT_SECS),
681        response_rx,
682    )
683    .await
684    {
685        Ok(Ok(decision)) => decision,
686        Ok(Err(_)) => return Err(CertmeshError::ApprovalUnavailable),
687        Err(_) => return Err(CertmeshError::ApprovalTimeout),
688    };
689
690    match decision {
691        ApprovalDecision::Approved { operator } => {
692            // When approval is required, an operator name must accompany it
693            // (the audit trail needs an accountable name).
694            if requires_approval && operator.as_deref().unwrap_or("").is_empty() {
695                return Err(CertmeshError::ApprovalDenied);
696            }
697            Ok(operator)
698        }
699        ApprovalDecision::Denied => Err(CertmeshError::ApprovalDenied),
700    }
701}
702
703#[async_trait::async_trait]
704impl Capability for CertmeshCore {
705    fn name(&self) -> &str {
706        "certmesh"
707    }
708
709    async fn status(&self) -> CapabilityStatus {
710        // Use try_lock for sync Capability trait - best effort
711        let ca_initialized = self.state.paths.is_ca_initialized();
712        let ca_locked = self
713            .state
714            .ca
715            .try_lock()
716            .map(|guard| guard.is_none())
717            .unwrap_or(true);
718        let member_count = self
719            .state
720            .roster
721            .try_lock()
722            .map(|guard| guard.active_count())
723            .unwrap_or(0);
724
725        let (summary, healthy) = if !ca_initialized {
726            ("ready \u{2014} run certmesh create".to_string(), true)
727        } else if ca_locked {
728            ("CA locked".to_string(), false)
729        } else {
730            (
731                format!(
732                    "active ({} member{})",
733                    member_count,
734                    if member_count == 1 { "" } else { "s" }
735                ),
736                true,
737            )
738        };
739
740        CapabilityStatus {
741            name: "certmesh".to_string(),
742            summary,
743            healthy,
744        }
745    }
746}
747
748// ── Shared helpers ──────────────────────────────────────────────────
749
750/// Build a CertmeshStatus from locked guards. Used by both the facade
751/// method and the HTTP handler to avoid duplicating the mapping logic.
752pub(crate) fn build_status(
753    paths: &CertmeshPaths,
754    ca_guard: &Option<ca::CaState>,
755    roster: &Roster,
756    auth_method: Option<&str>,
757) -> protocol::CertmeshStatus {
758    let ca_fingerprint = match ca_guard {
759        Some(ca) => Some(ca::ca_fingerprint(ca)),
760        None => ca::ca_fingerprint_from_disk(paths).ok(),
761    };
762
763    protocol::CertmeshStatus {
764        ca_initialized: paths.is_ca_initialized(),
765        ca_locked: ca_guard.is_none(),
766        ca_fingerprint,
767        enrollment_open: roster.metadata.enrollment_open,
768        requires_approval: roster.metadata.requires_approval,
769        enrollment_state: roster.enrollment_state(),
770        auth_method: auth_method.map(|s| s.to_string()),
771        member_count: roster.active_count(),
772        seq: roster.metadata.seq,
773        policy: roster.metadata.policy.clone(),
774        members: roster
775            .members
776            .iter()
777            .map(|m| protocol::MemberSummary {
778                hostname: m.hostname.clone(),
779                role: format!("{:?}", m.role).to_lowercase(),
780                status: format!("{:?}", m.status).to_lowercase(),
781                cert_fingerprint: m.cert_fingerprint.clone(),
782                cert_expires: m.cert_expires.to_rfc3339(),
783            })
784            .collect(),
785    }
786}
787
788#[cfg(test)]
789mod core_tests;