Skip to main content

koi_compose/
cores.rs

1//! Daemon core composition — the single place that constructs every domain core, wires the
2//! cross-domain integration bridges between them, spawns the domain background tasks
3//! (orchestrator + certmesh role loops), and tears it all down in order.
4//!
5//! Before P07 this graph was hand-written twice — in the binary's `daemon_mode` and again
6//! in the Windows service's `run_service` — and the two had already drifted into a verified
7//! `koi install` defect. [`build_cores`] is now the one copy both call, so the daemon they
8//! construct is identical by construction.
9//!
10//! The enrollment-approval pump is intentionally *not* spawned here: its decider differs by
11//! host (the foreground daemon prompts on stdin; consoleless hosts deny). The caller spawns
12//! it via [`crate::certmesh::spawn_enrollment_approval`].
13
14use std::path::Path;
15use std::sync::Arc;
16use std::time::Duration;
17
18use tokio::task::JoinHandle;
19use tokio_util::sync::CancellationToken;
20
21use koi_common::integration::{
22    AliasFeedback, CertmeshSnapshot, DnsProbe, MdnsSnapshot, ProxySnapshot,
23};
24
/// The set of domain cores a daemon runs. Each is present only if its capability is enabled
/// (via the `no_*` flags in [`CoreSpec`]).
///
/// Every field is an `Arc`, so cloning a `Cores` is cheap and yields handles to the same
/// underlying cores.
#[derive(Clone, Default)]
pub struct Cores {
    /// mDNS core. `None` when `no_mdns` is set, or when [`build_cores`] failed to
    /// initialize it with `fail_fast` off.
    pub mdns: Option<Arc<koi_mdns::MdnsCore>>,
    /// Certmesh core. `None` when `no_certmesh` is set, or when the blocking init task
    /// could not be joined with `fail_fast` off.
    pub certmesh: Option<Arc<koi_certmesh::CertmeshCore>>,
    /// DNS runtime. `None` when `no_dns` is set, or when the DNS core failed to construct
    /// with `fail_fast` off. A failed *start* alone still leaves this `Some`.
    pub dns: Option<Arc<koi_dns::DnsRuntime>>,
    /// Health runtime. `None` only when `no_health` is set.
    pub health: Option<Arc<koi_health::HealthRuntime>>,
    /// Proxy runtime. `None` when `no_proxy` is set, or when the proxy core failed to
    /// construct with `fail_fast` off. A failed listener start alone still leaves this `Some`.
    pub proxy: Option<Arc<koi_proxy::ProxyRuntime>>,
    /// UDP runtime. `None` only when `no_udp` is set.
    pub udp: Option<Arc<koi_udp::UdpRuntime>>,
    /// Runtime adapter core. `None` when `no_runtime` is set, when the backend selector is
    /// not recognized, or when the adapter could not start watching.
    pub runtime: Option<Arc<koi_runtime::RuntimeCore>>,
    /// The shared mDNS cached-records snapshot bridge (same instance DNS/health
    /// consume). Exposed so presentation adapters (e.g. the Prometheus SD endpoint's
    /// `?include=discovered` slice) can read cached records without spawning a second
    /// meta-browse. `None` when mDNS is disabled.
    pub mdns_snapshot: Option<Arc<dyn MdnsSnapshot>>,
}
42
/// Error from [`build_cores`] when `fail_fast` is set (koi-embedded's library contract).
/// With `fail_fast = false` (the daemon/service default) `build_cores` never returns this —
/// a capability that fails to initialize is logged and dropped and the daemon keeps running.
#[derive(Debug, thiserror::Error)]
pub enum BuildCoresError {
    /// The mDNS core could not be constructed.
    #[error("mDNS core init failed: {0}")]
    Mdns(#[from] koi_mdns::MdnsError),
    /// The DNS core could not be constructed, or the DNS server failed to start.
    #[error("DNS init/start failed: {0}")]
    Dns(#[from] koi_dns::DnsError),
    /// The proxy core could not be constructed, or its listeners failed to start.
    #[error("proxy init/start failed: {0}")]
    Proxy(#[from] koi_proxy::ProxyError),
    /// Health checks failed to start.
    #[error("health start failed: {0}")]
    Health(#[from] koi_health::HealthError),
    /// The blocking certmesh init task could not be joined; carries the join error rendered
    /// as a string.
    #[error("certmesh init task panicked: {0}")]
    CertmeshInit(String),
}
59
/// Capability flags + inputs needed to build the cores. A daemon-`Config` subset, kept here
/// (rather than depending on the binary's `Config`) so koi-compose stays standalone.
///
/// The daemon and the Windows service fill it by spreading [`CoreSpec::daemon_defaults`]
/// into a struct literal; `koi-embedded` sets the embedded-only forks (data-dir-scoped proxy,
/// pinned DNS state path, the auto-start + background-loop opt-ins) directly. Every field has
/// a daemon default so the two boot paths build the identical graph.
pub struct CoreSpec {
    /// Disable the mDNS capability (no mDNS core and no mDNS snapshot bridge are built).
    pub no_mdns: bool,
    /// Disable the certmesh capability.
    pub no_certmesh: bool,
    /// Disable the DNS capability.
    pub no_dns: bool,
    /// Disable the health capability.
    pub no_health: bool,
    /// Disable the proxy capability.
    pub no_proxy: bool,
    /// Disable the UDP capability.
    pub no_udp: bool,
    /// Disable the runtime adapter (and with it the runtime orchestrator).
    pub no_runtime: bool,
    /// Data directory for certmesh state. `None` uses the platform default (embedded leaves
    /// it unset when the host did not pin one); the daemon always resolves a concrete dir.
    pub data_dir: Option<std::path::PathBuf>,
    /// DNS configuration (the caller's resolved `DnsConfig`).
    pub dns_config: koi_dns::DnsConfig,
    /// Runtime backend selector string ("auto", "docker", "podman", …).
    pub runtime: String,
    /// Daemon HTTP port (the local management/API port the daemon binds).
    pub http_port: u16,
    /// Override the DNS state file path (embedded pins it to its data dir to be immune to
    /// `KOI_DATA_DIR` races in parallel tests). `None` keeps the `dns_config` value.
    pub dns_state_path: Option<std::path::PathBuf>,
    /// Build the proxy core scoped to this data dir (`ProxyCore::with_data_dir`). `None`
    /// uses the platform-default proxy state (the daemon's behavior).
    pub proxy_data_dir: Option<std::path::PathBuf>,
    /// Start the DNS server after constructing its core. The daemon always does; embedded
    /// gates it on `dns_auto_start`.
    pub dns_auto_start: bool,
    /// Start health checks after constructing the core (daemon: always; embedded: opt-in).
    pub health_auto_start: bool,
    /// Start the proxy listeners after constructing the core (daemon: always; embedded: opt-in).
    pub proxy_auto_start: bool,
    /// Spawn the runtime orchestrator when the runtime adapter is present (daemon: always;
    /// embedded: opt-in via the `orchestrator` builder flag).
    pub spawn_orchestrator: bool,
    /// Spawn the certmesh role-driven background loop when certmesh is present (daemon:
    /// always; embedded: opt-in via the `certmesh_background` builder flag).
    pub spawn_certmesh_loops: bool,
    /// Fail-fast contract: when `true` (koi-embedded, a library), the first core that fails to
    /// initialize or auto-start aborts `build_cores` with [`BuildCoresError`]. When `false`
    /// (the daemon/service), failures are logged and that capability is dropped so the daemon
    /// keeps serving its remaining capabilities.
    pub fail_fast: bool,
}
109
110impl CoreSpec {
111    /// The daemon/Windows-service defaults for the embedded-fork fields: platform-default
112    /// proxy/DNS state, always start DNS/health/proxy, always spawn the orchestrator and the
113    /// certmesh loops. Spread it into a struct literal so the daemon only names the
114    /// capability flags + resolved inputs and cannot accidentally diverge from the service.
115    ///
116    /// ```ignore
117    /// CoreSpec { no_mdns, /* … */, data_dir: Some(dir), dns_config, runtime, http_port,
118    ///            ..CoreSpec::daemon_defaults() }
119    /// ```
120    pub fn daemon_defaults() -> Self {
121        Self {
122            no_mdns: false,
123            no_certmesh: false,
124            no_dns: false,
125            no_health: false,
126            no_proxy: false,
127            no_udp: false,
128            no_runtime: false,
129            data_dir: None,
130            dns_config: koi_dns::DnsConfig::default(),
131            runtime: "auto".to_string(),
132            http_port: 0,
133            dns_state_path: None,
134            proxy_data_dir: None,
135            dns_auto_start: true,
136            health_auto_start: true,
137            proxy_auto_start: true,
138            spawn_orchestrator: true,
139            spawn_certmesh_loops: true,
140            fail_fast: false,
141        }
142    }
143}
144
145/// Initialize the certmesh core, auto-unlocking from the vault when a key is present.
146///
147/// Always returns `Some` (so HTTP routes mount even before `koi certmesh create`):
148/// - CA not initialized → an uninitialized core (routes reachable for `/create`);
149/// - CA initialized + a vault auto-unlock key present → booted **already unlocked**,
150///   collapsing the old "create locked → read key → unlock" three-step;
151/// - CA initialized + no key (or decryption fails) → a locked core.
152///
153/// This is the converged single definition shared by the daemon, the Windows service, and
154/// koi-embedded (the daemon path thereby gains the vault auto-unlock embedded already had).
155pub fn init_certmesh_core(data_dir: Option<&Path>) -> Option<Arc<koi_certmesh::CertmeshCore>> {
156    // Composition root: resolve the data dir once (Some -> injected dir, None -> the one
157    // default) and thread it into every branch so a custom data_dir is honoured end-to-end,
158    // including the early returns.
159    let paths = koi_certmesh::CertmeshPaths::with_data_dir(
160        koi_common::paths::koi_data_dir_with_override(data_dir),
161    );
162    if !paths.is_ca_initialized() {
163        tracing::info!("Certmesh: CA not initialized - routes mounted for /create");
164        return Some(Arc::new(
165            koi_certmesh::CertmeshCore::uninitialized_with_paths(paths),
166        ));
167    }
168
169    let roster_path = paths.roster_path();
170    let roster = match koi_certmesh::roster::load_roster(&roster_path) {
171        Ok(r) => r,
172        Err(e) => {
173            tracing::warn!(error = %e, "Failed to load certmesh roster - using uninitialized state");
174            return Some(Arc::new(
175                koi_certmesh::CertmeshCore::uninitialized_with_paths(paths),
176            ));
177        }
178    };
179
180    // ── F11 machine binding: refuse auto-unlock on a changed host ───────
181    // Checked BEFORE reading the vault key so a VM clone / disk restore onto new
182    // hardware (different machine-id) boots LOCKED instead of auto-unlocking with
183    // the copied vault key. Fail-safe + audited; a legitimate migration recovers
184    // with a one-time manual `koi certmesh unlock`.
185    let machine_ok = koi_certmesh::machine_binding_ok(&paths);
186    if !machine_ok {
187        let _ = koi_certmesh::audit::append_entry_to(
188            &paths.audit_log_path(),
189            "auto_unlock_refused_machine_changed",
190            &[],
191        );
192        tracing::error!(
193            "Certmesh: machine fingerprint changed since CA creation (clone/restore?) — \
194             booting LOCKED; run `koi certmesh unlock` to unlock manually on this host"
195        );
196    }
197
198    // ── Auto-unlock at init: single source of truth ─────────────
199    // The auto-unlock passphrase lives in the koi-crypto vault (written by
200    // CertmeshCore::save_auto_unlock_key_at, which deletes any legacy plaintext file).
201    // Retrieve it through the domain crate so this boot path can never drift from where the
202    // key is actually stored. When a key is present, boot the core already unlocked.
203    if machine_ok {
204        if let Ok(Some(pp)) = koi_certmesh::CertmeshCore::read_auto_unlock_key(&paths) {
205            match koi_certmesh::ca::load_ca(&pp, &paths) {
206                Ok(ca_state) => {
207                    // Reload roster (fresh copy for the new Arc)
208                    if let Ok(fresh_roster) = koi_certmesh::roster::load_roster(&roster_path) {
209                        let auth_path = paths.auth_path();
210                        let auth = if auth_path.exists() {
211                            std::fs::read_to_string(&auth_path)
212                                .ok()
213                                .and_then(|json| {
214                                    serde_json::from_str::<koi_crypto::auth::StoredAuth>(&json).ok()
215                                })
216                                .and_then(|stored| stored.unlock(&pp).ok())
217                        } else {
218                            None
219                        };
220
221                        tracing::info!("Certmesh CA auto-unlocked at init from vault");
222                        return Some(Arc::new(koi_certmesh::CertmeshCore::new_with_paths(
223                            ca_state,
224                            fresh_roster,
225                            auth,
226                            paths,
227                        )));
228                    }
229                }
230                Err(e) => {
231                    tracing::warn!(
232                        error = %e,
233                        "Auto-unlock key exists in vault but CA decryption failed"
234                    );
235                }
236            }
237        }
238    }
239
240    // No auto-unlock key - boot locked
241    tracing::info!("Certmesh: CA initialized (locked, use `koi certmesh unlock` to decrypt)");
242    let core = koi_certmesh::CertmeshCore::locked_with_paths(roster, paths);
243    Some(Arc::new(core))
244}
245
246/// Build all domain cores + cross-domain bridges, then spawn the caller-invariant domain
247/// background tasks: the runtime orchestrator (when runtime is enabled) and the certmesh
248/// role loops (when certmesh is enabled). Returns the assembled [`Cores`].
249///
250/// The bridges are wired in dependency order: DNS consumes the mDNS/certmesh/alias bridges;
251/// health consumes the mDNS/DNS/certmesh/proxy bridges. Disabled capabilities pass `None`.
252pub async fn build_cores(
253    spec: &CoreSpec,
254    cancel: &CancellationToken,
255    tasks: &mut Vec<JoinHandle<()>>,
256) -> Result<Cores, BuildCoresError> {
257    // ── mDNS ──
258    let mdns_core = if !spec.no_mdns {
259        match koi_mdns::MdnsCore::with_cancel(cancel.clone()) {
260            Ok(core) => Some(Arc::new(core)),
261            Err(e) => {
262                if spec.fail_fast {
263                    return Err(e.into());
264                }
265                tracing::error!(error = %e, "Failed to initialize mDNS core");
266                None
267            }
268        }
269    } else {
270        tracing::info!("mDNS capability: disabled");
271        None
272    };
273
274    // ── Certmesh ──
275    // The CA vault auto-unlock runs an Argon2id KDF (seconds on modest hardware), so run it on
276    // a blocking thread rather than stalling the async executor — the daemon gains this too.
277    let certmesh_core = if !spec.no_certmesh {
278        let data_dir = spec.data_dir.clone();
279        match tokio::task::spawn_blocking(move || init_certmesh_core(data_dir.as_deref())).await {
280            Ok(core) => core,
281            Err(e) => {
282                if spec.fail_fast {
283                    return Err(BuildCoresError::CertmeshInit(e.to_string()));
284                }
285                tracing::error!(error = %e, "certmesh init task panicked");
286                None
287            }
288        }
289    } else {
290        tracing::info!("Certmesh capability: disabled");
291        None
292    };
293
294    // ── Integration bridges ──
295    // These wrap domain cores and implement cross-domain traits from koi_common::integration.
296    let mdns_bridge: Option<Arc<dyn MdnsSnapshot>> = if let Some(ref core) = mdns_core {
297        Some(crate::bridges::MdnsBridge::spawn(core.clone()).await)
298    } else {
299        None
300    };
301
302    let certmesh_bridge: Option<Arc<dyn CertmeshSnapshot>> = certmesh_core
303        .as_ref()
304        .map(|core| crate::bridges::CertmeshBridge::new(core.clone()) as Arc<dyn CertmeshSnapshot>);
305
306    let alias_feedback: Option<Arc<dyn AliasFeedback>> = certmesh_core.as_ref().map(|core| {
307        crate::bridges::AliasFeedbackBridge::new(core.clone()) as Arc<dyn AliasFeedback>
308    });
309
310    // ── DNS (consumes mdns + certmesh + alias bridges) ──
311    let dns_runtime = if !spec.no_dns {
312        // Pin the DNS state path when the caller supplied one (embedded pins it to its data
313        // dir to stay immune to KOI_DATA_DIR env races in parallel tests).
314        let mut dns_config = spec.dns_config.clone();
315        if let Some(ref path) = spec.dns_state_path {
316            dns_config.state_path = Some(path.clone());
317        }
318        let core = koi_dns::DnsCore::new(
319            dns_config,
320            mdns_bridge.clone(),
321            certmesh_bridge.clone(),
322            alias_feedback,
323        )
324        .await;
325        match core {
326            Ok(core) => {
327                let runtime = Arc::new(koi_dns::DnsRuntime::new(core));
328                if spec.dns_auto_start {
329                    if let Err(e) = runtime.start().await {
330                        if spec.fail_fast {
331                            return Err(e.into());
332                        }
333                        tracing::error!(error = %e, "Failed to start DNS server");
334                    }
335                }
336                Some(runtime)
337            }
338            Err(e) => {
339                if spec.fail_fast {
340                    return Err(e.into());
341                }
342                tracing::error!(error = %e, "Failed to initialize DNS core");
343                None
344            }
345        }
346    } else {
347        tracing::info!("DNS capability: disabled");
348        None
349    };
350
351    // ── Proxy ──
352    let proxy_runtime = if !spec.no_proxy {
353        // Scope the proxy state to the caller's data dir when supplied (embedded), else use
354        // the platform-default state (the daemon).
355        let core = match spec.proxy_data_dir {
356            Some(ref dir) => koi_proxy::ProxyCore::with_data_dir(dir),
357            None => koi_proxy::ProxyCore::new(),
358        };
359        match core {
360            Ok(core) => {
361                let runtime = Arc::new(koi_proxy::ProxyRuntime::new(Arc::new(core)));
362                if spec.proxy_auto_start {
363                    if let Err(e) = runtime.start_all().await {
364                        if spec.fail_fast {
365                            return Err(e.into());
366                        }
367                        tracing::error!(error = %e, "Failed to start proxy listeners");
368                    }
369                }
370                Some(runtime)
371            }
372            Err(e) => {
373                if spec.fail_fast {
374                    return Err(e.into());
375                }
376                tracing::error!(error = %e, "Failed to initialize proxy core");
377                None
378            }
379        }
380    } else {
381        tracing::info!("Proxy capability: disabled");
382        None
383    };
384
385    let dns_bridge: Option<Arc<dyn DnsProbe>> = dns_runtime
386        .as_ref()
387        .map(|rt| crate::bridges::DnsBridge::new(rt.clone()) as Arc<dyn DnsProbe>);
388
389    let proxy_bridge: Option<Arc<dyn ProxySnapshot>> = proxy_runtime
390        .as_ref()
391        .map(|rt| crate::bridges::ProxyBridge::new(rt.core()) as Arc<dyn ProxySnapshot>);
392
393    // ── Health (consumes mdns + dns + certmesh + proxy bridges) ──
394    let health_runtime = if !spec.no_health {
395        let core = Arc::new(
396            koi_health::HealthCore::new(
397                mdns_bridge.clone(),
398                dns_bridge,
399                certmesh_bridge,
400                proxy_bridge,
401            )
402            .await,
403        );
404        let runtime = Arc::new(koi_health::HealthRuntime::new(core));
405        if spec.health_auto_start {
406            if let Err(e) = runtime.start().await {
407                if spec.fail_fast {
408                    return Err(e.into());
409                }
410                tracing::error!(error = %e, "Failed to start health checks");
411            }
412        }
413        Some(runtime)
414    } else {
415        tracing::info!("Health capability: disabled");
416        None
417    };
418
419    // ── UDP ──
420    let udp_runtime = if !spec.no_udp {
421        Some(Arc::new(koi_udp::UdpRuntime::new(cancel.clone())))
422    } else {
423        tracing::info!("UDP capability: disabled");
424        None
425    };
426
427    // ── Runtime adapter ──
428    let runtime_core = if !spec.no_runtime {
429        // No silent fallback: an unrecognized backend selector disables the
430        // runtime adapter rather than quietly running Auto. The CLI rejects bad
431        // values at parse time; this guards the service/env path.
432        match koi_runtime::RuntimeBackendKind::from_str_loose(&spec.runtime) {
433            Some(backend_kind) => {
434                let rt_config = koi_runtime::RuntimeConfig {
435                    backend_kind,
436                    socket_path: None,
437                };
438                let core = Arc::new(koi_runtime::RuntimeCore::new(rt_config));
439                match core.start_watching(cancel.clone()).await {
440                    Ok(()) => Some(core),
441                    Err(e) => {
442                        tracing::warn!(error = %e, "Runtime adapter unavailable, continuing without it");
443                        None
444                    }
445                }
446            }
447            None => {
448                tracing::error!(
449                    value = %spec.runtime,
450                    accepted = ?koi_runtime::RuntimeBackendKind::ACCEPTED,
451                    "Unknown runtime backend; disabling runtime adapter"
452                );
453                None
454            }
455        }
456    } else {
457        tracing::info!("Runtime capability: disabled");
458        None
459    };
460
461    // ── Runtime orchestrator ──
462    // Translates container lifecycle events into mDNS/DNS/health/proxy operations. The
463    // daemon always spawns it; embedded opts in (a leaf host wants only the event stream).
464    if spec.spawn_orchestrator {
465        if let Some(ref rt) = runtime_core {
466            tasks.push(crate::orchestrator::spawn_orchestrator(
467                rt,
468                crate::orchestrator::OrchestrationTargets {
469                    mdns: mdns_core.clone(),
470                    dns: dns_runtime.clone(),
471                    health: health_runtime.clone(),
472                    proxy: proxy_runtime.clone(),
473                },
474                cancel.clone(),
475            ));
476        }
477    }
478
479    let cores = Cores {
480        mdns: mdns_core,
481        certmesh: certmesh_core,
482        dns: dns_runtime,
483        health: health_runtime,
484        proxy: proxy_runtime,
485        udp: udp_runtime,
486        runtime: runtime_core,
487        mdns_snapshot: mdns_bridge,
488    };
489
490    // ── Certmesh role background loops ──
491    // The daemon always runs them; embedded opts in (a leaf does not need renewal/pull). The
492    // approval pump is spawned by the caller in every case (its decider differs by host).
493    if spec.spawn_certmesh_loops {
494        if let Some(ref certmesh) = cores.certmesh {
495            crate::certmesh::spawn_certmesh_background_tasks(certmesh, cancel, tasks);
496        }
497    }
498
499    tracing::debug!("Domain cores built");
500    Ok(cores)
501}
502
503/// Ordered teardown: cancel → drain in-flight → join tasks → core goodbye (mDNS, DNS, health,
504/// proxy, UDP). Bounded by `timeout`. The self-announce and trust-plane supervisors (in
505/// `tasks`) own their mDNS records and withdraw them on cancel — during the task-join step
506/// here — so no announce id is threaded through this teardown.
507pub async fn ordered_shutdown(
508    cancel: &CancellationToken,
509    tasks: Vec<JoinHandle<()>>,
510    cores: &Cores,
511    timeout: Duration,
512    drain: Duration,
513) {
514    let shutdown = async {
515        cancel.cancel();
516        tokio::time::sleep(drain).await;
517        for task in tasks {
518            let _ = task.await;
519        }
520        if let Some(ref core) = cores.mdns {
521            if let Err(e) = core.shutdown().await {
522                tracing::warn!(error = %e, "Error during mDNS shutdown");
523            }
524        }
525        if let Some(ref dns) = cores.dns {
526            dns.stop().await;
527        }
528        if let Some(ref health) = cores.health {
529            let _ = health.stop().await;
530        }
531        if let Some(ref proxy) = cores.proxy {
532            let _ = proxy.stop_all().await;
533        }
534        if let Some(ref udp) = cores.udp {
535            udp.shutdown().await;
536        }
537    };
538    if tokio::time::timeout(timeout, shutdown).await.is_err() {
539        tracing::warn!("Shutdown timed out after {:?} - forcing exit", timeout);
540    }
541}
542
#[cfg(test)]
mod tests {
    use super::*;
    use koi_certmesh::{CertmeshCore, CertmeshPaths};

    /// ADR-017 F11 regression guard, exercised through the **real boot path**
    /// (`init_certmesh_core`, not the unused `try_auto_unlock`): once the recorded machine
    /// fingerprint no longer matches (VM clone / disk restore), boot must come up LOCKED
    /// instead of auto-unlocking. An earlier implementation missed wiring
    /// `machine_binding_ok` into this path; this test keeps it from silently un-wiring.
    #[tokio::test]
    async fn init_certmesh_core_refuses_auto_unlock_on_machine_change() {
        let data_dir =
            koi_common::test::ensure_data_dir("koi-compose-cores-tests").join("f11-boot");
        let _ = std::fs::remove_dir_all(&data_dir);
        let paths = CertmeshPaths::with_data_dir(data_dir.clone());

        // Creating a CA with auto-unlock records the vault key + machine.bind.
        let create_request = koi_certmesh::protocol::CreateCaRequest {
            passphrase: "f11-boot-pass".into(),
            entropy_hex: "11".repeat(32),
            operator: None,
            enrollment_open: true,
            requires_approval: false,
            auto_unlock: true,
            totp_secret_hex: None,
        };
        let fresh_core = CertmeshCore::uninitialized_with_paths(paths.clone());
        fresh_core.create(create_request).await.expect("CA create");

        // Unchanged host: machine.bind matches, so boot auto-unlocks.
        let same_host = init_certmesh_core(Some(&data_dir)).expect("core");
        let same_host_status = same_host.certmesh_status().await;
        assert!(
            !same_host_status.ca_locked,
            "matching machine binding should auto-unlock at boot"
        );

        // Clone/restore simulation: clobber the recorded fingerprint. Boot must now refuse
        // to auto-unlock and come up LOCKED.
        std::fs::write(paths.machine_bind_path(), b"not-this-host-fingerprint").unwrap();
        let moved_host = init_certmesh_core(Some(&data_dir)).expect("core");
        let moved_host_status = moved_host.certmesh_status().await;
        assert!(
            moved_host_status.ca_locked,
            "a changed machine fingerprint must refuse auto-unlock at boot (F11)"
        );

        let _ = std::fs::remove_dir_all(&data_dir);
    }
}