decompose 0.2.1

A simple and flexible scheduler and orchestrator to manage non-containerized applications
Documentation
use std::collections::BTreeMap;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};

use serde::{Deserialize, Serialize};

/// Shared cell holding the current instance name for a running process.
///
/// Daemon tasks (lifecycle, output readers, probes) capture a clone of this
/// handle rather than a plain `String` so that the daemon can rename an
/// instance in place (e.g. scaling from `replicas == 1` to `replicas >= 2`
/// promotes `foo` to `foo[1]`) without invalidating the task's view of which
/// entry in `DaemonState.processes` belongs to it.
///
/// Writes happen under the daemon state lock as part of a reload. Reads are
/// cheap and lock-free on the hot path (they use a standard `RwLock`).
pub type NameHandle = Arc<RwLock<String>>;

/// Construct a new [`NameHandle`] from a starting name.
pub fn make_name_handle(name: String) -> NameHandle {
    Arc::new(RwLock::new(name))
}

/// Read the current name out of a handle. Panics only if the `RwLock` is
/// poisoned, which would indicate a panic in a daemon task under the lock
/// and is not expected in normal operation.
pub fn read_name(handle: &NameHandle) -> String {
    handle.read().expect("NameHandle RwLock poisoned").clone()
}

#[derive(Debug, Deserialize, Serialize, Clone, Copy, Default, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum DependencyCondition {
    #[default]
    ProcessStarted,
    ProcessCompleted,
    ProcessCompletedSuccessfully,
    ProcessHealthy,
    ProcessLogReady,
}

#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum RestartPolicy {
    #[default]
    No,
    OnFailure,
    Always,
}

#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ExitMode {
    #[default]
    WaitAll,
    ExitOnFailure,
    ExitOnEnd,
}

#[derive(Debug, Clone)]
pub struct ProcessInstanceSpec {
    pub name: String,
    pub base_name: String,
    pub replica: u16,
    pub command: String,
    pub description: Option<String>,
    pub working_dir: PathBuf,
    pub environment: BTreeMap<String, String>,
    pub depends_on: BTreeMap<String, DependencyCondition>,
    pub ready_log_line: Option<String>,
    pub restart_policy: RestartPolicy,
    pub backoff_seconds: u64,
    pub max_restarts: Option<u32>,
    pub shutdown_signal: Option<i32>,
    pub shutdown_timeout_seconds: u64,
    pub shutdown_command: Option<String>,
    pub readiness_probe: Option<HealthProbe>,
    pub liveness_probe: Option<HealthProbe>,
    pub disabled: bool,
    /// Stable SHA-256 hash of the service's `ProcessConfig`, excluding the
    /// fields that Docker Compose considers changeable-without-recreate
    /// (`depends_on`, `replicas`, `disabled`). Shared across replicas of the
    /// same service. Used by reload to diff services and decide which need
    /// to be restarted. Computed once in `build_process_instances`.
    pub config_hash: String,
}

/// User-facing definition of a readiness or liveness probe. Exactly one of
/// `exec` and `http_get` must be set (validated when the config is loaded).
/// Threshold semantics mirror Kubernetes: a probe must succeed
/// `success_threshold` consecutive times to flip the flag on, and fail
/// `failure_threshold` consecutive times to flip it off.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct HealthProbe {
    /// Run a shell command; exit code 0 = success, non-zero = failure. Most
    /// useful for probes that need access to the service's environment.
    #[serde(default)]
    pub exec: Option<ExecCheck>,
    /// Make an HTTP GET request; any 2xx response = success.
    #[serde(default)]
    pub http_get: Option<HttpCheck>,
    /// Interval between probe attempts, in seconds.
    #[serde(default = "default_period")]
    pub period_seconds: u64,
    /// Per-attempt timeout, in seconds. Exceeding it counts as a failure.
    #[serde(default = "default_timeout")]
    pub timeout_seconds: u64,
    /// Delay before the first probe runs, giving the process time to come
    /// up. Counted from `Running` transition, not from probe scheduling.
    #[serde(default = "default_initial_delay")]
    pub initial_delay_seconds: u64,
    /// Consecutive successes required before the probe flips the
    /// `ready`/`alive` flag on. Defaults to 1.
    #[serde(default = "default_success_threshold")]
    pub success_threshold: u32,
    /// Consecutive failures required before the probe flips the flag off
    /// (and, for liveness, triggers a SIGKILL restart). Defaults to 3.
    #[serde(default = "default_failure_threshold")]
    pub failure_threshold: u32,
}

fn default_period() -> u64 {
    10
}

fn default_timeout() -> u64 {
    1
}

fn default_initial_delay() -> u64 {
    0
}

fn default_success_threshold() -> u32 {
    1
}

fn default_failure_threshold() -> u32 {
    3
}

/// `exec` probe: a shell command whose exit code is interpreted as
/// success (`0`) or failure (non-zero). Runs with the same environment as
/// the parent service, so tools like `curl localhost:$PORT/health` pick up
/// the interpolated `PORT`.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ExecCheck {
    /// Shell command to execute via `sh -c`. Subject to the same `${VAR}`
    /// interpolation as the service `command`.
    pub command: String,
}

/// `http_get` probe: a synchronous HTTP request; any 2xx response counts as
/// success.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct HttpCheck {
    /// Hostname or IP to connect to. Defaults to `127.0.0.1`.
    #[serde(default = "default_host")]
    pub host: String,
    /// TCP port. Required.
    pub port: u16,
    /// `http` or `https`. Defaults to `http`.
    #[serde(default = "default_scheme")]
    pub scheme: String,
    /// Path component of the URL. Defaults to `/`.
    #[serde(default = "default_path")]
    pub path: String,
}

fn default_host() -> String {
    "127.0.0.1".to_string()
}

fn default_scheme() -> String {
    "http".to_string()
}

fn default_path() -> String {
    "/".to_string()
}

/// Lifecycle state of a single process instance, as tracked by the daemon.
/// Transitions are driven by the supervisor loop and the per-process
/// `process_lifecycle` task; clients see a serialised projection via
/// [`ProcessSnapshot`].
#[derive(Debug, Clone)]
pub enum ProcessStatus {
    /// Defined in config but not yet selected for launch (e.g. not part of
    /// the initial `up` subset). Will not be started by the supervisor until
    /// explicitly requested via `start` or `up`.
    NotStarted,
    /// Selected to run but waiting on `depends_on` conditions or for the
    /// supervisor's next tick to spawn the child.
    Pending,
    /// Live child process; `pid` is the OS PID of the immediate child (the
    /// shell, since commands are executed via `sh -c`).
    Running { pid: u32 },
    /// Child exited on its own. `code` is the OS exit code (`-N` for
    /// signal-terminated processes follows the standard mapping).
    Exited { code: i32 },
    /// Spawn or pre-spawn failed (bad command, missing working dir, etc.).
    /// `reason` is the contextualised error from the spawn attempt.
    FailedToStart { reason: String },
    /// Stopped by an explicit client request (`stop`, `down`, scale-down,
    /// reload removal). Distinct from `Exited` so reload diffing can tell
    /// the two apart.
    Stopped,
    /// Transient state between `Exited`/SIGKILL-after-liveness-failure and
    /// the next spawn under a `restart_policy` other than `no`.
    Restarting,
    /// Service has `disabled: true` in config; visible in `ps` and
    /// `config` output but never spawned.
    Disabled,
}

impl ProcessStatus {
    pub fn to_human(&self) -> String {
        match self {
            ProcessStatus::NotStarted => "not_started".to_string(),
            ProcessStatus::Pending => "pending".to_string(),
            ProcessStatus::Running { pid } => format!("running(pid={pid})"),
            ProcessStatus::Exited { code } => format!("exited(code={code})"),
            ProcessStatus::FailedToStart { reason } => format!("failed_to_start({reason})"),
            ProcessStatus::Stopped => "stopped".to_string(),
            ProcessStatus::Restarting => "restarting".to_string(),
            ProcessStatus::Disabled => "disabled".to_string(),
        }
    }

    pub fn to_json_status(&self) -> &'static str {
        match self {
            ProcessStatus::NotStarted => "not_started",
            ProcessStatus::Pending => "pending",
            ProcessStatus::Running { .. } => "running",
            ProcessStatus::Exited { code: 0 } => "exited",
            ProcessStatus::Exited { .. } => "failed",
            ProcessStatus::FailedToStart { .. } => "failed",
            ProcessStatus::Stopped => "stopped",
            ProcessStatus::Restarting => "restarting",
            ProcessStatus::Disabled => "disabled",
        }
    }

    pub fn is_terminal(&self) -> bool {
        matches!(
            self,
            ProcessStatus::NotStarted
                | ProcessStatus::Exited { .. }
                | ProcessStatus::FailedToStart { .. }
                | ProcessStatus::Stopped
                | ProcessStatus::Disabled
        )
    }
}

#[derive(Debug, Clone)]
pub struct ProcessRuntime {
    pub spec: ProcessInstanceSpec,
    pub status: ProcessStatus,
    pub started_once: bool,
    pub log_ready: bool,
    pub restart_count: u32,
    /// Readiness flag: set by the readiness probe's success threshold,
    /// cleared by the failure threshold or on process (re)start. This is
    /// what `depends_on: process_healthy` gates on and what `ps` reports
    /// as the HEALTH/STATE indicator.
    ///
    /// Starts `false` on a fresh process; without a configured readiness
    /// probe it remains `false` (see `dependencies_met` for how the
    /// dependency condition handles that case).
    pub ready: bool,
    /// Liveness flag: set by the liveness probe's success threshold, and
    /// cleared when the failure threshold trips (at which point the daemon
    /// SIGKILLs the process so the restart policy re-launches it).
    ///
    /// Defaults to `true` — a process with no liveness probe is assumed
    /// alive. Reset to `true` on (re)start so a new instance starts its
    /// probe cycle with a clean slate.
    pub alive: bool,
    /// Shared-by-reference current instance name. Daemon tasks spawned for
    /// this runtime hold `Arc` clones of this handle. When the daemon
    /// renames an instance in place during a scale 1↔N transition, it
    /// updates this cell (and re-keys the `processes` / `controllers`
    /// maps); tasks then continue looking up their own entry under the
    /// new name without restart.
    pub name_handle: NameHandle,
}

#[derive(Debug, Clone)]
pub struct RuntimePaths {
    pub socket: PathBuf,
    pub pid: PathBuf,
    pub daemon_log: PathBuf,
    pub lock: PathBuf,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessSnapshot {
    pub name: String,
    pub base: String,
    pub replica: u16,
    pub status: String,
    /// Unified status field for JSON consumers.
    pub state: String,
    pub description: Option<String>,
    pub restart_count: u32,
    pub log_ready: bool,
    /// Readiness-probe pass/fail flag. Drives `depends_on: process_healthy`
    /// and the `ps` HEALTH/STATE column. Without a readiness probe the
    /// daemon leaves this `false`; see `has_readiness_probe` for whether
    /// the service actually has one configured.
    pub ready: bool,
    /// Liveness-probe pass/fail flag. A failing liveness probe triggers a
    /// process restart via SIGKILL. Services without a liveness probe
    /// default to `true` (assumed alive).
    pub alive: bool,
    pub has_readiness_probe: bool,
    /// Whether a liveness probe is configured on the service. Exposed so
    /// downstream consumers can distinguish "no probe, assumed alive" from
    /// "probe configured and passing".
    pub has_liveness_probe: bool,
    pub pid: Option<u32>,
    pub exit_code: Option<i32>,
}

impl From<&ProcessRuntime> for ProcessSnapshot {
    /// Project a `ProcessRuntime` into its serializable `ProcessSnapshot`
    /// form. Both structs share every overlapping field (including the
    /// `has_readiness_probe` / `has_liveness_probe` bits derived from
    /// `spec`), so this mapping is fully mechanical. Keeping it in one
    /// place prevents drift when fields are added to either side.
    fn from(runtime: &ProcessRuntime) -> Self {
        let exit_code = match &runtime.status {
            ProcessStatus::Exited { code } => Some(*code),
            _ => None,
        };
        let pid = match &runtime.status {
            ProcessStatus::Running { pid } => Some(*pid),
            _ => None,
        };
        ProcessSnapshot {
            name: runtime.spec.name.clone(),
            base: runtime.spec.base_name.clone(),
            replica: runtime.spec.replica,
            status: runtime.status.to_human(),
            state: runtime.status.to_json_status().to_string(),
            description: runtime.spec.description.clone(),
            restart_count: runtime.restart_count,
            log_ready: runtime.log_ready,
            ready: runtime.ready,
            alive: runtime.alive,
            has_readiness_probe: runtime.spec.readiness_probe.is_some(),
            has_liveness_probe: runtime.spec.liveness_probe.is_some(),
            pid,
            exit_code,
        }
    }
}