Skip to main content

koi_compose/
cores.rs

1//! Daemon core composition — the single place that constructs every domain core, wires the
2//! cross-domain integration bridges between them, spawns the domain background tasks
3//! (orchestrator + certmesh role loops), and tears it all down in order.
4//!
5//! Before P07 this graph was hand-written twice — in the binary's `daemon_mode` and again
6//! in the Windows service's `run_service` — and the two had already drifted into a verified
7//! `koi install` defect. [`build_cores`] is now the one copy both call, so the daemon they
8//! construct is identical by construction.
9//!
10//! The enrollment-approval pump is intentionally *not* spawned here: its decider differs by
11//! host (the foreground daemon prompts on stdin; consoleless hosts deny). The caller spawns
12//! it via [`crate::certmesh::spawn_enrollment_approval`].
13
14use std::path::Path;
15use std::sync::Arc;
16use std::time::Duration;
17
18use tokio::task::JoinHandle;
19use tokio_util::sync::CancellationToken;
20
21use koi_common::integration::{
22    AliasFeedback, CertmeshSnapshot, DnsProbe, MdnsSnapshot, ProxySnapshot,
23};
24
/// The set of domain cores a daemon runs. Each is present only if its capability is enabled
/// (via the `no_*` flags in [`CoreSpec`]).
///
/// As assembled by [`build_cores`], a field can also be `None` for an *enabled* capability
/// whose construction failed: the error is logged and the daemon continues without it.
/// Consumers must therefore treat every field as genuinely optional. `Default` yields a
/// value with every field `None`.
#[derive(Clone, Default)]
pub struct Cores {
    /// mDNS core. `None` when mDNS is disabled or `MdnsCore::with_cancel` failed.
    pub mdns: Option<Arc<koi_mdns::MdnsCore>>,
    /// Certmesh core. `None` only when certmesh is disabled — [`init_certmesh_core`] always
    /// returns `Some` (uninitialized, locked, or already unlocked).
    pub certmesh: Option<Arc<koi_certmesh::CertmeshCore>>,
    /// DNS runtime. `None` when DNS is disabled or `DnsCore::new` failed. A failed
    /// `start()` is only logged, so a present runtime is not proof the server is serving.
    pub dns: Option<Arc<koi_dns::DnsRuntime>>,
    /// Health runtime. Present whenever health is enabled; a failed `start()` is only
    /// logged.
    pub health: Option<Arc<koi_health::HealthRuntime>>,
    /// Proxy runtime. `None` when proxy is disabled or `ProxyCore::new` failed. A failed
    /// `start_all()` is only logged.
    pub proxy: Option<Arc<koi_proxy::ProxyRuntime>>,
    /// UDP runtime. Present whenever UDP is enabled.
    pub udp: Option<Arc<koi_udp::UdpRuntime>>,
    /// Container-runtime adapter. `None` when runtime is disabled, the backend selector is
    /// not recognized, or `start_watching` failed.
    pub runtime: Option<Arc<koi_runtime::RuntimeCore>>,
    /// The shared mDNS cached-records snapshot bridge (same instance DNS/health
    /// consume). Exposed so presentation adapters (e.g. the Prometheus SD endpoint's
    /// `?include=discovered` slice) can read cached records without spawning a second
    /// meta-browse. `None` when mDNS is disabled (or the mDNS core failed to initialize,
    /// since the bridge is only spawned around a live core).
    pub mdns_snapshot: Option<Arc<dyn MdnsSnapshot>>,
}
42
/// Capability flags + inputs needed to build the cores. A daemon-`Config` subset, kept here
/// (rather than depending on the binary's `Config`) so koi-compose stays standalone.
///
/// Every `no_*` flag is an opt-out: `true` disables that capability, and [`build_cores`]
/// then leaves the matching [`Cores`] field `None`.
pub struct CoreSpec {
    /// Disable the mDNS core (and therefore the mDNS snapshot bridge).
    pub no_mdns: bool,
    /// Disable the certmesh core (and the certmesh/alias-feedback bridges).
    pub no_certmesh: bool,
    /// Disable the DNS runtime.
    pub no_dns: bool,
    /// Disable the health runtime.
    pub no_health: bool,
    /// Disable the proxy runtime.
    pub no_proxy: bool,
    /// Disable the UDP runtime.
    pub no_udp: bool,
    /// Disable the container-runtime adapter (and with it the orchestrator task).
    pub no_runtime: bool,
    /// Data directory for certmesh state (resolved by the caller). Passed to
    /// [`init_certmesh_core`] as the data-dir override.
    pub data_dir: std::path::PathBuf,
    /// DNS configuration (the caller's resolved `DnsConfig`). Cloned into `DnsCore::new`.
    pub dns_config: koi_dns::DnsConfig,
    /// Runtime backend selector string ("auto", "docker", "podman", …). Parsed with
    /// `RuntimeBackendKind::from_str_loose`; an unrecognized value disables the runtime
    /// adapter instead of falling back to a default.
    pub runtime: String,
    /// Daemon HTTP port (the local management/API port the daemon binds).
    /// NOTE(review): not read by `build_cores` or `init_certmesh_core` in this file —
    /// presumably consumed by the caller; confirm.
    pub http_port: u16,
}
62
63/// Initialize the certmesh core, auto-unlocking from the vault when a key is present.
64///
65/// Always returns `Some` (so HTTP routes mount even before `koi certmesh create`):
66/// - CA not initialized → an uninitialized core (routes reachable for `/create`);
67/// - CA initialized + a vault auto-unlock key present → booted **already unlocked**,
68///   collapsing the old "create locked → read key → unlock" three-step;
69/// - CA initialized + no key (or decryption fails) → a locked core.
70///
71/// This is the converged single definition shared by the daemon, the Windows service, and
72/// koi-embedded (the daemon path thereby gains the vault auto-unlock embedded already had).
73pub fn init_certmesh_core(data_dir: Option<&Path>) -> Option<Arc<koi_certmesh::CertmeshCore>> {
74    // Composition root: resolve the data dir once (Some -> injected dir, None -> the one
75    // default) and thread it into every branch so a custom data_dir is honoured end-to-end,
76    // including the early returns.
77    let paths = koi_certmesh::CertmeshPaths::with_data_dir(
78        koi_common::paths::koi_data_dir_with_override(data_dir),
79    );
80    if !paths.is_ca_initialized() {
81        tracing::info!("Certmesh: CA not initialized - routes mounted for /create");
82        return Some(Arc::new(
83            koi_certmesh::CertmeshCore::uninitialized_with_paths(paths),
84        ));
85    }
86
87    let roster_path = paths.roster_path();
88    let roster = match koi_certmesh::roster::load_roster(&roster_path) {
89        Ok(r) => r,
90        Err(e) => {
91            tracing::warn!(error = %e, "Failed to load certmesh roster - using uninitialized state");
92            return Some(Arc::new(
93                koi_certmesh::CertmeshCore::uninitialized_with_paths(paths),
94            ));
95        }
96    };
97
98    // ── F11 machine binding: refuse auto-unlock on a changed host ───────
99    // Checked BEFORE reading the vault key so a VM clone / disk restore onto new
100    // hardware (different machine-id) boots LOCKED instead of auto-unlocking with
101    // the copied vault key. Fail-safe + audited; a legitimate migration recovers
102    // with a one-time manual `koi certmesh unlock`.
103    let machine_ok = koi_certmesh::machine_binding_ok(&paths);
104    if !machine_ok {
105        let _ = koi_certmesh::audit::append_entry_to(
106            &paths.audit_log_path(),
107            "auto_unlock_refused_machine_changed",
108            &[],
109        );
110        tracing::error!(
111            "Certmesh: machine fingerprint changed since CA creation (clone/restore?) — \
112             booting LOCKED; run `koi certmesh unlock` to unlock manually on this host"
113        );
114    }
115
116    // ── Auto-unlock at init: single source of truth ─────────────
117    // The auto-unlock passphrase lives in the koi-crypto vault (written by
118    // CertmeshCore::save_auto_unlock_key_at, which deletes any legacy plaintext file).
119    // Retrieve it through the domain crate so this boot path can never drift from where the
120    // key is actually stored. When a key is present, boot the core already unlocked.
121    if machine_ok {
122        if let Ok(Some(pp)) = koi_certmesh::CertmeshCore::read_auto_unlock_key(&paths) {
123            match koi_certmesh::ca::load_ca(&pp, &paths) {
124                Ok(ca_state) => {
125                    // Reload roster (fresh copy for the new Arc)
126                    if let Ok(fresh_roster) = koi_certmesh::roster::load_roster(&roster_path) {
127                        let auth_path = paths.auth_path();
128                        let auth = if auth_path.exists() {
129                            std::fs::read_to_string(&auth_path)
130                                .ok()
131                                .and_then(|json| {
132                                    serde_json::from_str::<koi_crypto::auth::StoredAuth>(&json).ok()
133                                })
134                                .and_then(|stored| stored.unlock(&pp).ok())
135                        } else {
136                            None
137                        };
138
139                        tracing::info!("Certmesh CA auto-unlocked at init from vault");
140                        return Some(Arc::new(koi_certmesh::CertmeshCore::new_with_paths(
141                            ca_state,
142                            fresh_roster,
143                            auth,
144                            paths,
145                        )));
146                    }
147                }
148                Err(e) => {
149                    tracing::warn!(
150                        error = %e,
151                        "Auto-unlock key exists in vault but CA decryption failed"
152                    );
153                }
154            }
155        }
156    }
157
158    // No auto-unlock key - boot locked
159    tracing::info!("Certmesh: CA initialized (locked, use `koi certmesh unlock` to decrypt)");
160    let core = koi_certmesh::CertmeshCore::locked_with_paths(roster, paths);
161    Some(Arc::new(core))
162}
163
164/// Build all domain cores + cross-domain bridges, then spawn the caller-invariant domain
165/// background tasks: the runtime orchestrator (when runtime is enabled) and the certmesh
166/// role loops (when certmesh is enabled). Returns the assembled [`Cores`].
167///
168/// The bridges are wired in dependency order: DNS consumes the mDNS/certmesh/alias bridges;
169/// health consumes the mDNS/DNS/certmesh/proxy bridges. Disabled capabilities pass `None`.
170pub async fn build_cores(
171    spec: &CoreSpec,
172    cancel: &CancellationToken,
173    tasks: &mut Vec<JoinHandle<()>>,
174) -> Cores {
175    // ── mDNS ──
176    let mdns_core = if !spec.no_mdns {
177        match koi_mdns::MdnsCore::with_cancel(cancel.clone()) {
178            Ok(core) => Some(Arc::new(core)),
179            Err(e) => {
180                tracing::error!(error = %e, "Failed to initialize mDNS core");
181                None
182            }
183        }
184    } else {
185        tracing::info!("mDNS capability: disabled");
186        None
187    };
188
189    // ── Certmesh ──
190    let certmesh_core = if !spec.no_certmesh {
191        init_certmesh_core(Some(&spec.data_dir))
192    } else {
193        tracing::info!("Certmesh capability: disabled");
194        None
195    };
196
197    // ── Integration bridges ──
198    // These wrap domain cores and implement cross-domain traits from koi_common::integration.
199    let mdns_bridge: Option<Arc<dyn MdnsSnapshot>> = if let Some(ref core) = mdns_core {
200        Some(crate::bridges::MdnsBridge::spawn(core.clone()).await)
201    } else {
202        None
203    };
204
205    let certmesh_bridge: Option<Arc<dyn CertmeshSnapshot>> = certmesh_core
206        .as_ref()
207        .map(|core| crate::bridges::CertmeshBridge::new(core.clone()) as Arc<dyn CertmeshSnapshot>);
208
209    let alias_feedback: Option<Arc<dyn AliasFeedback>> = certmesh_core.as_ref().map(|core| {
210        crate::bridges::AliasFeedbackBridge::new(core.clone()) as Arc<dyn AliasFeedback>
211    });
212
213    // ── DNS (consumes mdns + certmesh + alias bridges) ──
214    let dns_runtime = if !spec.no_dns {
215        let core = koi_dns::DnsCore::new(
216            spec.dns_config.clone(),
217            mdns_bridge.clone(),
218            certmesh_bridge.clone(),
219            alias_feedback,
220        )
221        .await;
222        match core {
223            Ok(core) => {
224                let runtime = Arc::new(koi_dns::DnsRuntime::new(core));
225                if let Err(e) = runtime.start().await {
226                    tracing::error!(error = %e, "Failed to start DNS server");
227                }
228                Some(runtime)
229            }
230            Err(e) => {
231                tracing::error!(error = %e, "Failed to initialize DNS core");
232                None
233            }
234        }
235    } else {
236        tracing::info!("DNS capability: disabled");
237        None
238    };
239
240    // ── Proxy ──
241    let proxy_runtime = if !spec.no_proxy {
242        match koi_proxy::ProxyCore::new() {
243            Ok(core) => {
244                let runtime = Arc::new(koi_proxy::ProxyRuntime::new(Arc::new(core)));
245                if let Err(e) = runtime.start_all().await {
246                    tracing::error!(error = %e, "Failed to start proxy listeners");
247                }
248                Some(runtime)
249            }
250            Err(e) => {
251                tracing::error!(error = %e, "Failed to initialize proxy core");
252                None
253            }
254        }
255    } else {
256        tracing::info!("Proxy capability: disabled");
257        None
258    };
259
260    let dns_bridge: Option<Arc<dyn DnsProbe>> = dns_runtime
261        .as_ref()
262        .map(|rt| crate::bridges::DnsBridge::new(rt.clone()) as Arc<dyn DnsProbe>);
263
264    let proxy_bridge: Option<Arc<dyn ProxySnapshot>> = proxy_runtime
265        .as_ref()
266        .map(|rt| crate::bridges::ProxyBridge::new(rt.core()) as Arc<dyn ProxySnapshot>);
267
268    // ── Health (consumes mdns + dns + certmesh + proxy bridges) ──
269    let health_runtime = if !spec.no_health {
270        let core = Arc::new(
271            koi_health::HealthCore::new(
272                mdns_bridge.clone(),
273                dns_bridge,
274                certmesh_bridge,
275                proxy_bridge,
276            )
277            .await,
278        );
279        let runtime = Arc::new(koi_health::HealthRuntime::new(core));
280        if let Err(e) = runtime.start().await {
281            tracing::error!(error = %e, "Failed to start health checks");
282        }
283        Some(runtime)
284    } else {
285        tracing::info!("Health capability: disabled");
286        None
287    };
288
289    // ── UDP ──
290    let udp_runtime = if !spec.no_udp {
291        Some(Arc::new(koi_udp::UdpRuntime::new(cancel.clone())))
292    } else {
293        tracing::info!("UDP capability: disabled");
294        None
295    };
296
297    // ── Runtime adapter ──
298    let runtime_core = if !spec.no_runtime {
299        // No silent fallback: an unrecognized backend selector disables the
300        // runtime adapter rather than quietly running Auto. The CLI rejects bad
301        // values at parse time; this guards the service/env path.
302        match koi_runtime::RuntimeBackendKind::from_str_loose(&spec.runtime) {
303            Some(backend_kind) => {
304                let rt_config = koi_runtime::RuntimeConfig {
305                    backend_kind,
306                    socket_path: None,
307                };
308                let core = Arc::new(koi_runtime::RuntimeCore::new(rt_config));
309                match core.start_watching(cancel.clone()).await {
310                    Ok(()) => Some(core),
311                    Err(e) => {
312                        tracing::warn!(error = %e, "Runtime adapter unavailable, continuing without it");
313                        None
314                    }
315                }
316            }
317            None => {
318                tracing::error!(
319                    value = %spec.runtime,
320                    accepted = ?koi_runtime::RuntimeBackendKind::ACCEPTED,
321                    "Unknown runtime backend; disabling runtime adapter"
322                );
323                None
324            }
325        }
326    } else {
327        tracing::info!("Runtime capability: disabled");
328        None
329    };
330
331    // ── Runtime orchestrator ──
332    // Translates container lifecycle events into mDNS/DNS/health/proxy operations.
333    if let Some(ref rt) = runtime_core {
334        tasks.push(crate::orchestrator::spawn_orchestrator(
335            rt,
336            crate::orchestrator::OrchestrationTargets {
337                mdns: mdns_core.clone(),
338                dns: dns_runtime.clone(),
339                health: health_runtime.clone(),
340                proxy: proxy_runtime.clone(),
341            },
342            cancel.clone(),
343        ));
344    }
345
346    let cores = Cores {
347        mdns: mdns_core,
348        certmesh: certmesh_core,
349        dns: dns_runtime,
350        health: health_runtime,
351        proxy: proxy_runtime,
352        udp: udp_runtime,
353        runtime: runtime_core,
354        mdns_snapshot: mdns_bridge,
355    };
356
357    // ── Certmesh role background loops (caller-invariant) ──
358    // The approval pump is spawned by the caller (its decider differs by host).
359    if let Some(ref certmesh) = cores.certmesh {
360        crate::certmesh::spawn_certmesh_background_tasks(certmesh, cancel, tasks);
361    }
362
363    tracing::debug!("Domain cores built");
364    cores
365}
366
367/// Ordered teardown: cancel → drain in-flight → join tasks → withdraw the HTTP mDNS
368/// announcement → core goodbye (mDNS, DNS, health, proxy, UDP). Bounded by `timeout`.
369pub async fn ordered_shutdown(
370    cancel: &CancellationToken,
371    tasks: Vec<JoinHandle<()>>,
372    cores: &Cores,
373    http_announce_id: Option<String>,
374    timeout: Duration,
375    drain: Duration,
376) {
377    let shutdown = async {
378        cancel.cancel();
379        tokio::time::sleep(drain).await;
380        for task in tasks {
381            let _ = task.await;
382        }
383        if let Some(ref id) = http_announce_id {
384            if let Some(ref core) = cores.mdns {
385                if let Err(e) = core.unregister(id) {
386                    tracing::warn!(error = %e, "Failed to withdraw HTTP mDNS announcement");
387                }
388            }
389        }
390        if let Some(ref core) = cores.mdns {
391            if let Err(e) = core.shutdown().await {
392                tracing::warn!(error = %e, "Error during mDNS shutdown");
393            }
394        }
395        if let Some(ref dns) = cores.dns {
396            dns.stop().await;
397        }
398        if let Some(ref health) = cores.health {
399            let _ = health.stop().await;
400        }
401        if let Some(ref proxy) = cores.proxy {
402            let _ = proxy.stop_all().await;
403        }
404        if let Some(ref udp) = cores.udp {
405            udp.shutdown().await;
406        }
407    };
408    if tokio::time::timeout(timeout, shutdown).await.is_err() {
409        tracing::warn!("Shutdown timed out after {:?} - forcing exit", timeout);
410    }
411}
412
#[cfg(test)]
mod tests {
    use super::*;
    use koi_certmesh::{CertmeshCore, CertmeshPaths};

    /// ADR-017 F11 regression guard. Exercises the **real boot path**
    /// (`init_certmesh_core`, not the unused `try_auto_unlock`): once the recorded machine
    /// fingerprint no longer matches (a VM clone / disk restore), boot must refuse to
    /// auto-unlock. An earlier implementation forgot to wire `machine_binding_ok` into this
    /// path; this test keeps that wiring from silently disappearing again.
    #[tokio::test]
    async fn init_certmesh_core_refuses_auto_unlock_on_machine_change() {
        // Start from a clean, test-private data directory.
        let data_dir =
            koi_common::test::ensure_data_dir("koi-compose-cores-tests").join("f11-boot");
        let _ = std::fs::remove_dir_all(&data_dir);
        let paths = CertmeshPaths::with_data_dir(data_dir.clone());

        // Create a CA with auto-unlock — records the vault key + machine.bind.
        let request = koi_certmesh::protocol::CreateCaRequest {
            passphrase: "f11-boot-pass".into(),
            entropy_hex: "11".repeat(32),
            operator: None,
            enrollment_open: true,
            requires_approval: false,
            auto_unlock: true,
            totp_secret_hex: None,
        };
        let ca = CertmeshCore::uninitialized_with_paths(paths.clone());
        ca.create(request).await.expect("CA create");

        // Same host (machine.bind matches) → the boot path auto-unlocks.
        let booted = init_certmesh_core(Some(data_dir.as_path())).expect("core");
        let status_same_host = booted.certmesh_status().await;
        assert!(
            !status_same_host.ca_locked,
            "matching machine binding should auto-unlock at boot"
        );

        // Simulate a clone/restore by overwriting the recorded fingerprint: the boot path
        // must now refuse auto-unlock and come up LOCKED.
        std::fs::write(paths.machine_bind_path(), b"not-this-host-fingerprint").unwrap();
        let booted_after = init_certmesh_core(Some(data_dir.as_path())).expect("core");
        let status_after_change = booted_after.certmesh_status().await;
        assert!(
            status_after_change.ca_locked,
            "a changed machine fingerprint must refuse auto-unlock at boot (F11)"
        );

        let _ = std::fs::remove_dir_all(&data_dir);
    }
}