koi_compose/cores.rs
1//! Daemon core composition — the single place that constructs every domain core, wires the
2//! cross-domain integration bridges between them, spawns the domain background tasks
3//! (orchestrator + certmesh role loops), and tears it all down in order.
4//!
5//! Before P07 this graph was hand-written twice — in the binary's `daemon_mode` and again
6//! in the Windows service's `run_service` — and the two had already drifted into a verified
7//! `koi install` defect. [`build_cores`] is now the one copy both call, so the daemon they
8//! construct is identical by construction.
9//!
10//! The enrollment-approval pump is intentionally *not* spawned here: its decider differs by
11//! host (the foreground daemon prompts on stdin; consoleless hosts deny). The caller spawns
12//! it via [`crate::certmesh::spawn_enrollment_approval`].
13
14use std::path::Path;
15use std::sync::Arc;
16use std::time::Duration;
17
18use tokio::task::JoinHandle;
19use tokio_util::sync::CancellationToken;
20
21use koi_common::integration::{
22 AliasFeedback, CertmeshSnapshot, DnsProbe, MdnsSnapshot, ProxySnapshot,
23};
24
25/// The set of domain cores a daemon runs. Each is present only if its capability is enabled
26/// (via the `no_*` flags in [`CoreSpec`]).
27#[derive(Clone, Default)]
28pub struct Cores {
29 pub mdns: Option<Arc<koi_mdns::MdnsCore>>,
30 pub certmesh: Option<Arc<koi_certmesh::CertmeshCore>>,
31 pub dns: Option<Arc<koi_dns::DnsRuntime>>,
32 pub health: Option<Arc<koi_health::HealthRuntime>>,
33 pub proxy: Option<Arc<koi_proxy::ProxyRuntime>>,
34 pub udp: Option<Arc<koi_udp::UdpRuntime>>,
35 pub runtime: Option<Arc<koi_runtime::RuntimeCore>>,
36 /// The shared mDNS cached-records snapshot bridge (same instance DNS/health
37 /// consume). Exposed so presentation adapters (e.g. the Prometheus SD endpoint's
38 /// `?include=discovered` slice) can read cached records without spawning a second
39 /// meta-browse. `None` when mDNS is disabled.
40 pub mdns_snapshot: Option<Arc<dyn MdnsSnapshot>>,
41}
42
43/// Error from [`build_cores`] when `fail_fast` is set (koi-embedded's library contract).
44/// With `fail_fast = false` (the daemon/service default) `build_cores` never returns this —
45/// a capability that fails to initialize is logged and dropped and the daemon keeps running.
46#[derive(Debug, thiserror::Error)]
47pub enum BuildCoresError {
48 #[error("mDNS core init failed: {0}")]
49 Mdns(#[from] koi_mdns::MdnsError),
50 #[error("DNS init/start failed: {0}")]
51 Dns(#[from] koi_dns::DnsError),
52 #[error("proxy init/start failed: {0}")]
53 Proxy(#[from] koi_proxy::ProxyError),
54 #[error("health start failed: {0}")]
55 Health(#[from] koi_health::HealthError),
56 #[error("certmesh init task panicked: {0}")]
57 CertmeshInit(String),
58}
59
60/// Capability flags + inputs needed to build the cores. A daemon-`Config` subset, kept here
61/// (rather than depending on the binary's `Config`) so koi-compose stays standalone.
62///
63/// The daemon and the Windows service fill it via [`CoreSpec::daemon`]; `koi-embedded` sets
64/// the embedded-only forks (data-dir-scoped proxy, pinned DNS state path, the auto-start +
65/// background-loop opt-ins) directly. Every field has a daemon default so the two boot paths
66/// build the identical graph.
67pub struct CoreSpec {
68 pub no_mdns: bool,
69 pub no_certmesh: bool,
70 pub no_dns: bool,
71 pub no_health: bool,
72 pub no_proxy: bool,
73 pub no_udp: bool,
74 pub no_runtime: bool,
75 /// Data directory for certmesh state. `None` uses the platform default (embedded leaves
76 /// it unset when the host did not pin one); the daemon always resolves a concrete dir.
77 pub data_dir: Option<std::path::PathBuf>,
78 /// DNS configuration (the caller's resolved `DnsConfig`).
79 pub dns_config: koi_dns::DnsConfig,
80 /// Runtime backend selector string ("auto", "docker", "podman", …).
81 pub runtime: String,
82 /// Daemon HTTP port (the local management/API port the daemon binds).
83 pub http_port: u16,
84 /// Override the DNS state file path (embedded pins it to its data dir to be immune to
85 /// `KOI_DATA_DIR` races in parallel tests). `None` keeps the `dns_config` value.
86 pub dns_state_path: Option<std::path::PathBuf>,
87 /// Build the proxy core scoped to this data dir (`ProxyCore::with_data_dir`). `None`
88 /// uses the platform-default proxy state (the daemon's behavior).
89 pub proxy_data_dir: Option<std::path::PathBuf>,
90 /// Start the DNS server after constructing its core. The daemon always does; embedded
91 /// gates it on `dns_auto_start`.
92 pub dns_auto_start: bool,
93 /// Start health checks after constructing the core (daemon: always; embedded: opt-in).
94 pub health_auto_start: bool,
95 /// Start the proxy listeners after constructing the core (daemon: always; embedded: opt-in).
96 pub proxy_auto_start: bool,
97 /// Spawn the runtime orchestrator when the runtime adapter is present (daemon: always;
98 /// embedded: opt-in via the `orchestrator` builder flag).
99 pub spawn_orchestrator: bool,
100 /// Spawn the certmesh role-driven background loop when certmesh is present (daemon:
101 /// always; embedded: opt-in via the `certmesh_background` builder flag).
102 pub spawn_certmesh_loops: bool,
103 /// Fail-fast contract: when `true` (koi-embedded, a library), the first core that fails to
104 /// initialize or auto-start aborts `build_cores` with [`BuildCoresError`]. When `false`
105 /// (the daemon/service), failures are logged and that capability is dropped so the daemon
106 /// keeps serving its remaining capabilities.
107 pub fail_fast: bool,
108}
109
110impl CoreSpec {
111 /// The daemon/Windows-service defaults for the embedded-fork fields: platform-default
112 /// proxy/DNS state, always start DNS/health/proxy, always spawn the orchestrator and the
113 /// certmesh loops. Spread it into a struct literal so the daemon only names the
114 /// capability flags + resolved inputs and cannot accidentally diverge from the service.
115 ///
116 /// ```ignore
117 /// CoreSpec { no_mdns, /* … */, data_dir: Some(dir), dns_config, runtime, http_port,
118 /// ..CoreSpec::daemon_defaults() }
119 /// ```
120 pub fn daemon_defaults() -> Self {
121 Self {
122 no_mdns: false,
123 no_certmesh: false,
124 no_dns: false,
125 no_health: false,
126 no_proxy: false,
127 no_udp: false,
128 no_runtime: false,
129 data_dir: None,
130 dns_config: koi_dns::DnsConfig::default(),
131 runtime: "auto".to_string(),
132 http_port: 0,
133 dns_state_path: None,
134 proxy_data_dir: None,
135 dns_auto_start: true,
136 health_auto_start: true,
137 proxy_auto_start: true,
138 spawn_orchestrator: true,
139 spawn_certmesh_loops: true,
140 fail_fast: false,
141 }
142 }
143}
144
145/// Initialize the certmesh core, auto-unlocking from the vault when a key is present.
146///
147/// Always returns `Some` (so HTTP routes mount even before `koi certmesh create`):
148/// - CA not initialized → an uninitialized core (routes reachable for `/create`);
149/// - CA initialized + a vault auto-unlock key present → booted **already unlocked**,
150/// collapsing the old "create locked → read key → unlock" three-step;
151/// - CA initialized + no key (or decryption fails) → a locked core.
152///
153/// This is the converged single definition shared by the daemon, the Windows service, and
154/// koi-embedded (the daemon path thereby gains the vault auto-unlock embedded already had).
155pub fn init_certmesh_core(data_dir: Option<&Path>) -> Option<Arc<koi_certmesh::CertmeshCore>> {
156 // Composition root: resolve the data dir once (Some -> injected dir, None -> the one
157 // default) and thread it into every branch so a custom data_dir is honoured end-to-end,
158 // including the early returns.
159 let paths = koi_certmesh::CertmeshPaths::with_data_dir(
160 koi_common::paths::koi_data_dir_with_override(data_dir),
161 );
162 if !paths.is_ca_initialized() {
163 tracing::info!("Certmesh: CA not initialized - routes mounted for /create");
164 return Some(Arc::new(
165 koi_certmesh::CertmeshCore::uninitialized_with_paths(paths),
166 ));
167 }
168
169 let roster_path = paths.roster_path();
170 let roster = match koi_certmesh::roster::load_roster(&roster_path) {
171 Ok(r) => r,
172 Err(e) => {
173 tracing::warn!(error = %e, "Failed to load certmesh roster - using uninitialized state");
174 return Some(Arc::new(
175 koi_certmesh::CertmeshCore::uninitialized_with_paths(paths),
176 ));
177 }
178 };
179
180 // ── F11 machine binding: refuse auto-unlock on a changed host ───────
181 // Checked BEFORE reading the vault key so a VM clone / disk restore onto new
182 // hardware (different machine-id) boots LOCKED instead of auto-unlocking with
183 // the copied vault key. Fail-safe + audited; a legitimate migration recovers
184 // with a one-time manual `koi certmesh unlock`.
185 let machine_ok = koi_certmesh::machine_binding_ok(&paths);
186 if !machine_ok {
187 let _ = koi_certmesh::audit::append_entry_to(
188 &paths.audit_log_path(),
189 "auto_unlock_refused_machine_changed",
190 &[],
191 );
192 tracing::error!(
193 "Certmesh: machine fingerprint changed since CA creation (clone/restore?) — \
194 booting LOCKED; run `koi certmesh unlock` to unlock manually on this host"
195 );
196 }
197
198 // ── Auto-unlock at init: single source of truth ─────────────
199 // The auto-unlock passphrase lives in the koi-crypto vault (written by
200 // CertmeshCore::save_auto_unlock_key_at, which deletes any legacy plaintext file).
201 // Retrieve it through the domain crate so this boot path can never drift from where the
202 // key is actually stored. When a key is present, boot the core already unlocked.
203 if machine_ok {
204 if let Ok(Some(pp)) = koi_certmesh::CertmeshCore::read_auto_unlock_key(&paths) {
205 match koi_certmesh::ca::load_ca(&pp, &paths) {
206 Ok(ca_state) => {
207 // Reload roster (fresh copy for the new Arc)
208 if let Ok(fresh_roster) = koi_certmesh::roster::load_roster(&roster_path) {
209 let auth_path = paths.auth_path();
210 let auth = if auth_path.exists() {
211 std::fs::read_to_string(&auth_path)
212 .ok()
213 .and_then(|json| {
214 serde_json::from_str::<koi_crypto::auth::StoredAuth>(&json).ok()
215 })
216 .and_then(|stored| stored.unlock(&pp).ok())
217 } else {
218 None
219 };
220
221 tracing::info!("Certmesh CA auto-unlocked at init from vault");
222 return Some(Arc::new(koi_certmesh::CertmeshCore::new_with_paths(
223 ca_state,
224 fresh_roster,
225 auth,
226 paths,
227 )));
228 }
229 }
230 Err(e) => {
231 tracing::warn!(
232 error = %e,
233 "Auto-unlock key exists in vault but CA decryption failed"
234 );
235 }
236 }
237 }
238 }
239
240 // No auto-unlock key - boot locked
241 tracing::info!("Certmesh: CA initialized (locked, use `koi certmesh unlock` to decrypt)");
242 let core = koi_certmesh::CertmeshCore::locked_with_paths(roster, paths);
243 Some(Arc::new(core))
244}
245
246/// Build all domain cores + cross-domain bridges, then spawn the caller-invariant domain
247/// background tasks: the runtime orchestrator (when runtime is enabled) and the certmesh
248/// role loops (when certmesh is enabled). Returns the assembled [`Cores`].
249///
250/// The bridges are wired in dependency order: DNS consumes the mDNS/certmesh/alias bridges;
251/// health consumes the mDNS/DNS/certmesh/proxy bridges. Disabled capabilities pass `None`.
252pub async fn build_cores(
253 spec: &CoreSpec,
254 cancel: &CancellationToken,
255 tasks: &mut Vec<JoinHandle<()>>,
256) -> Result<Cores, BuildCoresError> {
257 // ── mDNS ──
258 let mdns_core = if !spec.no_mdns {
259 match koi_mdns::MdnsCore::with_cancel(cancel.clone()) {
260 Ok(core) => Some(Arc::new(core)),
261 Err(e) => {
262 if spec.fail_fast {
263 return Err(e.into());
264 }
265 tracing::error!(error = %e, "Failed to initialize mDNS core");
266 None
267 }
268 }
269 } else {
270 tracing::info!("mDNS capability: disabled");
271 None
272 };
273
274 // ── Certmesh ──
275 // The CA vault auto-unlock runs an Argon2id KDF (seconds on modest hardware), so run it on
276 // a blocking thread rather than stalling the async executor — the daemon gains this too.
277 let certmesh_core = if !spec.no_certmesh {
278 let data_dir = spec.data_dir.clone();
279 match tokio::task::spawn_blocking(move || init_certmesh_core(data_dir.as_deref())).await {
280 Ok(core) => core,
281 Err(e) => {
282 if spec.fail_fast {
283 return Err(BuildCoresError::CertmeshInit(e.to_string()));
284 }
285 tracing::error!(error = %e, "certmesh init task panicked");
286 None
287 }
288 }
289 } else {
290 tracing::info!("Certmesh capability: disabled");
291 None
292 };
293
294 // ── Integration bridges ──
295 // These wrap domain cores and implement cross-domain traits from koi_common::integration.
296 let mdns_bridge: Option<Arc<dyn MdnsSnapshot>> = if let Some(ref core) = mdns_core {
297 Some(crate::bridges::MdnsBridge::spawn(core.clone()).await)
298 } else {
299 None
300 };
301
302 let certmesh_bridge: Option<Arc<dyn CertmeshSnapshot>> = certmesh_core
303 .as_ref()
304 .map(|core| crate::bridges::CertmeshBridge::new(core.clone()) as Arc<dyn CertmeshSnapshot>);
305
306 let alias_feedback: Option<Arc<dyn AliasFeedback>> = certmesh_core.as_ref().map(|core| {
307 crate::bridges::AliasFeedbackBridge::new(core.clone()) as Arc<dyn AliasFeedback>
308 });
309
310 // ── DNS (consumes mdns + certmesh + alias bridges) ──
311 let dns_runtime = if !spec.no_dns {
312 // Pin the DNS state path when the caller supplied one (embedded pins it to its data
313 // dir to stay immune to KOI_DATA_DIR env races in parallel tests).
314 let mut dns_config = spec.dns_config.clone();
315 if let Some(ref path) = spec.dns_state_path {
316 dns_config.state_path = Some(path.clone());
317 }
318 let core = koi_dns::DnsCore::new(
319 dns_config,
320 mdns_bridge.clone(),
321 certmesh_bridge.clone(),
322 alias_feedback,
323 )
324 .await;
325 match core {
326 Ok(core) => {
327 let runtime = Arc::new(koi_dns::DnsRuntime::new(core));
328 if spec.dns_auto_start {
329 if let Err(e) = runtime.start().await {
330 if spec.fail_fast {
331 return Err(e.into());
332 }
333 tracing::error!(error = %e, "Failed to start DNS server");
334 }
335 }
336 Some(runtime)
337 }
338 Err(e) => {
339 if spec.fail_fast {
340 return Err(e.into());
341 }
342 tracing::error!(error = %e, "Failed to initialize DNS core");
343 None
344 }
345 }
346 } else {
347 tracing::info!("DNS capability: disabled");
348 None
349 };
350
351 // ── Proxy ──
352 let proxy_runtime = if !spec.no_proxy {
353 // Scope the proxy state to the caller's data dir when supplied (embedded), else use
354 // the platform-default state (the daemon).
355 let core = match spec.proxy_data_dir {
356 Some(ref dir) => koi_proxy::ProxyCore::with_data_dir(dir),
357 None => koi_proxy::ProxyCore::new(),
358 };
359 match core {
360 Ok(core) => {
361 let runtime = Arc::new(koi_proxy::ProxyRuntime::new(Arc::new(core)));
362 if spec.proxy_auto_start {
363 if let Err(e) = runtime.start_all().await {
364 if spec.fail_fast {
365 return Err(e.into());
366 }
367 tracing::error!(error = %e, "Failed to start proxy listeners");
368 }
369 }
370 Some(runtime)
371 }
372 Err(e) => {
373 if spec.fail_fast {
374 return Err(e.into());
375 }
376 tracing::error!(error = %e, "Failed to initialize proxy core");
377 None
378 }
379 }
380 } else {
381 tracing::info!("Proxy capability: disabled");
382 None
383 };
384
385 let dns_bridge: Option<Arc<dyn DnsProbe>> = dns_runtime
386 .as_ref()
387 .map(|rt| crate::bridges::DnsBridge::new(rt.clone()) as Arc<dyn DnsProbe>);
388
389 let proxy_bridge: Option<Arc<dyn ProxySnapshot>> = proxy_runtime
390 .as_ref()
391 .map(|rt| crate::bridges::ProxyBridge::new(rt.core()) as Arc<dyn ProxySnapshot>);
392
393 // ── Health (consumes mdns + dns + certmesh + proxy bridges) ──
394 let health_runtime = if !spec.no_health {
395 let core = Arc::new(
396 koi_health::HealthCore::new(
397 mdns_bridge.clone(),
398 dns_bridge,
399 certmesh_bridge,
400 proxy_bridge,
401 )
402 .await,
403 );
404 let runtime = Arc::new(koi_health::HealthRuntime::new(core));
405 if spec.health_auto_start {
406 if let Err(e) = runtime.start().await {
407 if spec.fail_fast {
408 return Err(e.into());
409 }
410 tracing::error!(error = %e, "Failed to start health checks");
411 }
412 }
413 Some(runtime)
414 } else {
415 tracing::info!("Health capability: disabled");
416 None
417 };
418
419 // ── UDP ──
420 let udp_runtime = if !spec.no_udp {
421 Some(Arc::new(koi_udp::UdpRuntime::new(cancel.clone())))
422 } else {
423 tracing::info!("UDP capability: disabled");
424 None
425 };
426
427 // ── Runtime adapter ──
428 let runtime_core = if !spec.no_runtime {
429 // No silent fallback: an unrecognized backend selector disables the
430 // runtime adapter rather than quietly running Auto. The CLI rejects bad
431 // values at parse time; this guards the service/env path.
432 match koi_runtime::RuntimeBackendKind::from_str_loose(&spec.runtime) {
433 Some(backend_kind) => {
434 let rt_config = koi_runtime::RuntimeConfig {
435 backend_kind,
436 socket_path: None,
437 };
438 let core = Arc::new(koi_runtime::RuntimeCore::new(rt_config));
439 match core.start_watching(cancel.clone()).await {
440 Ok(()) => Some(core),
441 Err(e) => {
442 tracing::warn!(error = %e, "Runtime adapter unavailable, continuing without it");
443 None
444 }
445 }
446 }
447 None => {
448 tracing::error!(
449 value = %spec.runtime,
450 accepted = ?koi_runtime::RuntimeBackendKind::ACCEPTED,
451 "Unknown runtime backend; disabling runtime adapter"
452 );
453 None
454 }
455 }
456 } else {
457 tracing::info!("Runtime capability: disabled");
458 None
459 };
460
461 // ── Runtime orchestrator ──
462 // Translates container lifecycle events into mDNS/DNS/health/proxy operations. The
463 // daemon always spawns it; embedded opts in (a leaf host wants only the event stream).
464 if spec.spawn_orchestrator {
465 if let Some(ref rt) = runtime_core {
466 tasks.push(crate::orchestrator::spawn_orchestrator(
467 rt,
468 crate::orchestrator::OrchestrationTargets {
469 mdns: mdns_core.clone(),
470 dns: dns_runtime.clone(),
471 health: health_runtime.clone(),
472 proxy: proxy_runtime.clone(),
473 },
474 cancel.clone(),
475 ));
476 }
477 }
478
479 let cores = Cores {
480 mdns: mdns_core,
481 certmesh: certmesh_core,
482 dns: dns_runtime,
483 health: health_runtime,
484 proxy: proxy_runtime,
485 udp: udp_runtime,
486 runtime: runtime_core,
487 mdns_snapshot: mdns_bridge,
488 };
489
490 // ── Certmesh role background loops ──
491 // The daemon always runs them; embedded opts in (a leaf does not need renewal/pull). The
492 // approval pump is spawned by the caller in every case (its decider differs by host).
493 if spec.spawn_certmesh_loops {
494 if let Some(ref certmesh) = cores.certmesh {
495 crate::certmesh::spawn_certmesh_background_tasks(certmesh, cancel, tasks);
496 }
497 }
498
499 tracing::debug!("Domain cores built");
500 Ok(cores)
501}
502
503/// Ordered teardown: cancel → drain in-flight → join tasks → core goodbye (mDNS, DNS, health,
504/// proxy, UDP). Bounded by `timeout`. The self-announce and trust-plane supervisors (in
505/// `tasks`) own their mDNS records and withdraw them on cancel — during the task-join step
506/// here — so no announce id is threaded through this teardown.
507pub async fn ordered_shutdown(
508 cancel: &CancellationToken,
509 tasks: Vec<JoinHandle<()>>,
510 cores: &Cores,
511 timeout: Duration,
512 drain: Duration,
513) {
514 let shutdown = async {
515 cancel.cancel();
516 tokio::time::sleep(drain).await;
517 for task in tasks {
518 let _ = task.await;
519 }
520 if let Some(ref core) = cores.mdns {
521 if let Err(e) = core.shutdown().await {
522 tracing::warn!(error = %e, "Error during mDNS shutdown");
523 }
524 }
525 if let Some(ref dns) = cores.dns {
526 dns.stop().await;
527 }
528 if let Some(ref health) = cores.health {
529 let _ = health.stop().await;
530 }
531 if let Some(ref proxy) = cores.proxy {
532 let _ = proxy.stop_all().await;
533 }
534 if let Some(ref udp) = cores.udp {
535 udp.shutdown().await;
536 }
537 };
538 if tokio::time::timeout(timeout, shutdown).await.is_err() {
539 tracing::warn!("Shutdown timed out after {:?} - forcing exit", timeout);
540 }
541}
542
#[cfg(test)]
mod tests {
    use super::*;
    use koi_certmesh::{CertmeshCore, CertmeshPaths};

    /// Regression guard for ADR-017 F11. It exercises the **real boot path**
    /// (`init_certmesh_core`, not the unused `try_auto_unlock`): once the machine fingerprint
    /// recorded at CA creation no longer matches (a VM clone / disk restore), boot must come
    /// up locked instead of auto-unlocking. An earlier implementation forgot to wire
    /// `machine_binding_ok` into this path; this test keeps it from silently un-wiring again.
    #[tokio::test]
    async fn init_certmesh_core_refuses_auto_unlock_on_machine_change() {
        let data_dir =
            koi_common::test::ensure_data_dir("koi-compose-cores-tests").join("f11-boot");
        // Start from a clean slate; a missing directory is fine.
        let _ = std::fs::remove_dir_all(&data_dir);
        let paths = CertmeshPaths::with_data_dir(data_dir.clone());

        // Create a CA with auto-unlock — this records the vault key + machine.bind.
        let create_request = koi_certmesh::protocol::CreateCaRequest {
            passphrase: "f11-boot-pass".into(),
            entropy_hex: "11".repeat(32),
            operator: None,
            enrollment_open: true,
            requires_approval: false,
            auto_unlock: true,
            totp_secret_hex: None,
        };
        let ca = CertmeshCore::uninitialized_with_paths(paths.clone());
        ca.create(create_request).await.expect("CA create");

        // Same host (machine.bind matches) → the boot path auto-unlocks.
        let same_host = init_certmesh_core(Some(data_dir.as_path())).expect("core");
        let same_host_status = same_host.certmesh_status().await;
        assert!(
            !same_host_status.ca_locked,
            "matching machine binding should auto-unlock at boot"
        );

        // Simulate a clone/restore by overwriting the recorded fingerprint. The boot path
        // must now refuse auto-unlock and come up LOCKED.
        std::fs::write(paths.machine_bind_path(), b"not-this-host-fingerprint").unwrap();
        let cloned_host = init_certmesh_core(Some(data_dir.as_path())).expect("core");
        let cloned_host_status = cloned_host.certmesh_status().await;
        assert!(
            cloned_host_status.ca_locked,
            "a changed machine fingerprint must refuse auto-unlock at boot (F11)"
        );

        let _ = std::fs::remove_dir_all(&data_dir);
    }
}