trusty-mpm 0.8.0

trusty-mpm: unified multi-agent orchestration platform (core, daemon, CLI, TUI, Telegram)
//! Supervisor metrics HTTP server: `/metrics` and `/health`.
//!
//! Why: the unattended supervisor exposes fleet state so an operator or a
//! higher-level fleet manager can poll it without attaching to any session. A
//! tiny axum server serving a JSON snapshot satisfies the "metrics exposed on a
//! health endpoint" acceptance criterion while keeping the surface minimal.
//! What: holds a shared [`MetricsHandle`] (an `Arc<RwLock<FleetMetrics>>`) that
//! the supervisor loop updates after each sweep; [`router`] builds the axum
//! [`Router`]; [`bind`] reserves the port (fail-fast) and [`serve_on`] serves it
//! on the bound listener. Gated behind the `daemon` feature because axum is only
//! a dependency there (per the workspace axum-feature rule).
//! Test: `metrics_endpoint_returns_snapshot`, `health_endpoint_ok` in `super::tests`.

use std::net::SocketAddr;
use std::sync::Arc;

use axum::{Json, Router, extract::State, routing::get};
use serde::Serialize;
use tokio::sync::RwLock;
use tracing::info;

use super::metrics::FleetMetrics;

/// Shared, lock-guarded fleet-metrics snapshot.
///
/// Why: the loop writes a fresh snapshot after each sweep while HTTP handlers read
/// it concurrently; an `Arc<RwLock<…>>` gives many readers / one writer, so
/// readers on the (frequent) read path never wait on each other — they only
/// wait (asynchronously, via `.await`) while a write is pending or in progress.
/// What: a type alias for the shared handle passed to both the loop and the router.
/// Test: exercised by `metrics_endpoint_returns_snapshot`.
pub type MetricsHandle = Arc<RwLock<FleetMetrics>>;

/// Create the shared metrics cell, initially holding an empty snapshot.
///
/// Why: the supervisor loop and the HTTP server must both point at the same
/// cell, so it has to exist before either of them is started.
/// What: builds a default [`FleetMetrics`] and wraps it in `Arc<RwLock<…>>`.
/// Test: used by the loop wiring and `metrics_endpoint_returns_snapshot`.
pub fn new_handle() -> MetricsHandle {
    let empty = FleetMetrics::default();
    let cell = RwLock::new(empty);
    Arc::new(cell)
}

/// JSON body returned by the `/health` endpoint.
///
/// Why: liveness probes (launchd KeepAlive checks, monitoring) need a stable,
/// machine-checkable shape so they can assert `status == "ok"`.
/// What: a single-field struct that serializes to `{"status":"ok"}`.
/// Test: `health_endpoint_ok`.
#[derive(Serialize, Debug)]
struct Health {
    /// Fixed to `"ok"` for as long as the server is answering requests.
    status: &'static str,
}

/// Assemble the supervisor's metrics [`Router`] around a shared [`MetricsHandle`].
///
/// Why: keeping router construction apart from [`serve_on`] lets tests drive the
/// handlers in-process (via `tower::ServiceExt::oneshot`) without binding a socket.
/// What: registers `GET /health` and `GET /metrics`, then attaches `handle` as
/// the axum state so the `/metrics` handler can read the snapshot.
/// Test: `metrics_endpoint_returns_snapshot`, `health_endpoint_ok`.
pub fn router(handle: MetricsHandle) -> Router {
    // Routes are declared against the state type first; the concrete handle is
    // supplied in the final step.
    let routes: Router<MetricsHandle> = Router::new()
        .route("/health", get(health))
        .route("/metrics", get(metrics));
    routes.with_state(handle)
}

/// `GET /health` — liveness probe.
///
/// Why: a process supervisor needs a trivially cheap endpoint to confirm the
/// always-on supervisor is still alive.
/// What: responds with the JSON body `{"status":"ok"}`; touches no shared state.
/// Test: `health_endpoint_ok`.
async fn health() -> Json<Health> {
    let body = Health { status: "ok" };
    Json(body)
}

/// `GET /metrics` — current fleet snapshot.
///
/// Why: this is the one endpoint a human or fleet manager polls to see counts by
/// lifecycle state, surfaced pending decisions, last activity, and supervisor
/// run stats.
/// What: takes the read lock, clones the current [`FleetMetrics`], releases the
/// lock, and returns the clone as JSON.
/// Test: `metrics_endpoint_returns_snapshot`.
async fn metrics(State(handle): State<MetricsHandle>) -> Json<FleetMetrics> {
    let guard = handle.read().await;
    let snapshot: FleetMetrics = (*guard).clone();
    // Release the read lock before building the response.
    drop(guard);
    Json(snapshot)
}

/// Reserve the metrics server's port by binding a `TcpListener` on `addr`.
///
/// Why: if the metrics port is already taken the supervisor must fail *fast*;
/// otherwise it would run for hours with no `/metrics` and only a buried log
/// line. Binding before the loop starts, and propagating the error, turns that
/// silent degradation into an immediate startup failure the operator sees.
/// What: binds and returns the listener; a bind failure is returned to the
/// caller with the offending address attached as context.
/// Test: covered indirectly by `run_supervisor` (bind-before-loop); the failure
/// path mirrors the daemon's own listener bind.
pub async fn bind(addr: SocketAddr) -> anyhow::Result<tokio::net::TcpListener> {
    use anyhow::Context as _;

    let listener = tokio::net::TcpListener::bind(addr)
        .await
        .with_context(|| format!("binding supervisor metrics server to {addr}"))?;
    Ok(listener)
}

/// Run the metrics router on a pre-bound listener until the task is dropped.
///
/// Why: with bind and serve split, the caller can bind first (failing fast on a
/// port collision) and only then spawn the serving task, so a bind error is
/// propagated synchronously instead of being lost inside a detached
/// `tokio::spawn`.
/// What: logs the bound address when it can be determined (a failed address
/// lookup is ignored, since it only affects the log line), then serves
/// [`router`] on `listener`; returns an error only if serving itself fails.
/// Test: handlers are unit-tested via the router; the serve path mirrors the
/// daemon's own `serve_http`.
pub async fn serve_on(
    listener: tokio::net::TcpListener,
    handle: MetricsHandle,
) -> anyhow::Result<()> {
    if let Ok(addr) = listener.local_addr() {
        info!("supervisor metrics listening on http://{addr}/metrics");
    }
    let app = router(handle);
    axum::serve(listener, app).await?;
    Ok(())
}