zlayer-provisioner 0.14.1

Provider-agnostic cloud node provisioning trait + cloud-init reference impl
Documentation
#![doc = r"
Provider-agnostic cloud node provisioning for `ZLayer` autoscaling.

This crate defines the [`CloudProvisioner`] trait and the value types used to
request and describe worker nodes that join a `ZLayer` cluster. The trait plus the
core value types depend only on `async-trait`, `serde`, and `thiserror`, so a
downstream consumer (for example a `ZataCloudDeploy` backend) can implement
the trait against its own cloud SDK without pulling in the reference
implementation's runtime dependencies.

The built-in [`CloudInitProvisioner`] (behind the default `cloud-init` feature)
is a provider-agnostic implementation that shells out to operator-supplied
commands and feeds each node a cloud-init `#cloud-config` that runs
`zlayer node join` on boot. It requires no cloud SDK.

# Identifiers

[`ProviderNodeId`] is the provider-scoped identifier for a node (for example an
`EC2` instance id or an opaque token printed by a provisioning script). It is
deliberately distinct from the raft layer's numeric node id.
"]

use std::collections::BTreeMap;

/// Provider-scoped node identifier.
///
/// This is whatever the provisioning backend uses to name a node (an instance
/// id, a VM name, an opaque token, ...). It is intentionally distinct from the
/// raft layer's numeric `NodeId`.
///
/// This is a plain alias for [`String`], not a newtype: any `String` can be
/// used where a `ProviderNodeId` is expected, and the compiler performs no
/// extra checking on its contents.
pub type ProviderNodeId = String;

/// Whether a node is billed as standard on-demand capacity or as interruptible
/// spot/preemptible capacity.
///
/// The type is `Copy` and comparable with `==`. Its [`Default`] value is
/// [`CapacityType::OnDemand`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum CapacityType {
    /// Standard, non-interruptible capacity. This is the default variant.
    #[default]
    OnDemand,
    /// Interruptible, discounted capacity (spot / preemptible).
    Spot,
}

/// Desired shape of a node to provision.
///
/// Construct with [`NodeShape::new`] and refine the optional fields directly.
///
/// All fields are public. The type does not derive `PartialEq` (it contains an
/// `f64`), so callers that need to compare shapes must compare fields
/// individually.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub struct NodeShape {
    /// Requested vCPU count (fractional allowed, e.g. for burstable shapes).
    pub cpu: f64,
    /// Requested memory in bytes.
    pub memory_bytes: u64,
    /// Requested GPU count (0 if none).
    pub gpu: u32,
    /// Optional GPU vendor hint (e.g. `"nvidia"`, `"amd"`).
    pub gpu_vendor: Option<String>,
    /// Arbitrary scheduling labels to attach to the node.
    ///
    /// Stored in a `BTreeMap`, so iteration is ordered by label key.
    pub labels: BTreeMap<String, String>,
    /// Optional availability zone / placement hint.
    pub zone: Option<String>,
    /// Desired billing capacity type.
    pub capacity_type: CapacityType,
}

impl Default for NodeShape {
    fn default() -> Self {
        Self {
            cpu: 1.0,
            memory_bytes: 1024 * 1024 * 1024,
            gpu: 0,
            gpu_vendor: None,
            labels: BTreeMap::new(),
            zone: None,
            capacity_type: CapacityType::OnDemand,
        }
    }
}

impl NodeShape {
    /// Create a [`NodeShape`] with the given CPU and memory, defaulting the
    /// remaining fields (no GPU, no labels, no zone, on-demand capacity).
    #[must_use]
    pub fn new(cpu: f64, memory_bytes: u64) -> Self {
        Self {
            cpu,
            memory_bytes,
            ..Self::default()
        }
    }
}

/// Lifecycle state of a provisioned node as it works toward cluster membership.
///
/// The type is `Copy` and comparable with `==`. No `Default` is derived, so a
/// state must always be chosen explicitly when building a [`NodeHandle`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum JoinState {
    /// The provider is creating the underlying machine.
    Provisioning,
    /// The machine exists and is booting.
    Booting,
    /// The node agent is attempting to join the cluster.
    Joining,
    /// The node has joined the cluster.
    Joined,
    /// Provisioning or joining failed.
    Failed,
}

/// A handle to a provisioned (or in-flight) node.
///
/// This is a plain data record with public fields; it is what
/// [`CloudProvisioner::provision`] and [`CloudProvisioner::describe`] return.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub struct NodeHandle {
    /// Provider-scoped identifier for the node.
    pub provider_id: ProviderNodeId,
    /// Reachable address (IP or hostname), once known.
    ///
    /// `None` while the address has not been determined yet.
    pub address: Option<String>,
    /// Availability zone the node landed in, if known.
    pub zone: Option<String>,
    /// Billing capacity type the node was provisioned as.
    pub capacity_type: CapacityType,
    /// Current lifecycle state.
    pub join_state: JoinState,
}

/// A best-effort price estimate for a given shape.
///
/// Returned by [`CloudProvisioner::price_hint`]. The estimate is advisory; this
/// type carries no information about how it was computed.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub struct PriceHint {
    /// Estimated hourly cost in US dollars.
    pub hourly_usd: f64,
    /// Capacity type the estimate applies to.
    pub capacity_type: CapacityType,
}

/// Errors returned by a [`CloudProvisioner`].
///
/// Every variant carries a single free-form `String` message. The `Display`
/// output is that message prefixed with a short category tag (for example
/// `"capacity unavailable: <message>"`), except for
/// [`ProvisionerError::Other`], which renders the message unchanged.
#[derive(thiserror::Error, Debug)]
pub enum ProvisionerError {
    /// The requested operation or shape is not supported by this provider.
    ///
    /// Displays as `unsupported: <message>`.
    #[error("unsupported: {0}")]
    Unsupported(String),
    /// The provider could not satisfy the request due to lack of capacity.
    ///
    /// Displays as `capacity unavailable: <message>`.
    #[error("capacity unavailable: {0}")]
    Capacity(String),
    /// Authentication or authorization with the provider failed.
    ///
    /// Displays as `auth: <message>`.
    #[error("auth: {0}")]
    Auth(String),
    /// A transport / IO error occurred while talking to the provider.
    ///
    /// Displays as `transport: <message>`.
    #[error("transport: {0}")]
    Transport(String),
    /// Any other error.
    ///
    /// Displays as the bare message with no prefix.
    #[error("{0}")]
    Other(String),
}

/// Convenience result alias for provisioner operations.
///
/// Fixes the error type to [`ProvisionerError`]. Note that it shadows the
/// prelude's `Result` inside this module, which is why the definition spells
/// out `std::result::Result`.
pub type Result<T> = std::result::Result<T, ProvisionerError>;

/// A pluggable backend that can create, destroy, and enumerate `ZLayer` nodes.
///
/// Implementations are expected to be cheap to clone or share (the trait
/// requires `Send + Sync`). Methods that talk to a remote provider are async;
/// the introspection methods ([`capacity_types`](CloudProvisioner::capacity_types),
/// [`price_hint`](CloudProvisioner::price_hint), [`name`](CloudProvisioner::name))
/// are synchronous and must not block.
///
/// The async methods are declared through the `async_trait` attribute macro.
/// All methods take `&self`, so any mutable state an implementation needs must
/// use interior mutability.
#[async_trait::async_trait]
pub trait CloudProvisioner: Send + Sync {
    /// Provision a node matching `shape`.
    ///
    /// On success, returns a [`NodeHandle`] describing the node. The trait does
    /// not prescribe which [`JoinState`] the handle is in at return time.
    ///
    /// # Errors
    ///
    /// Returns [`ProvisionerError`] if the provider rejects the request, lacks
    /// capacity, fails authentication, or the underlying transport fails.
    async fn provision(&self, shape: &NodeShape) -> Result<NodeHandle>;

    /// Terminate the node identified by `id`.
    ///
    /// # Errors
    ///
    /// Returns [`ProvisionerError`] if the provider fails to terminate the node
    /// or the transport fails.
    // `ProviderNodeId` is a `String` alias by design (provider ids are owned),
    // so the `&ProviderNodeId` contract is intentional even though it resolves
    // to `&String`.
    #[allow(clippy::ptr_arg)]
    async fn terminate(&self, id: &ProviderNodeId) -> Result<()>;

    /// Enumerate the nodes currently known to this provisioner.
    ///
    /// Returns one [`NodeHandle`] per node; an empty vector is a valid result.
    ///
    /// # Errors
    ///
    /// Returns [`ProvisionerError`] if the provider cannot be queried.
    async fn describe(&self) -> Result<Vec<NodeHandle>>;

    /// The capacity types this provisioner supports.
    ///
    /// Synchronous; the slice is borrowed from the provisioner.
    fn capacity_types(&self) -> &[CapacityType];

    /// A best-effort price estimate for `shape`, if one can be computed.
    ///
    /// Synchronous; returns `None` when no estimate is available.
    fn price_hint(&self, shape: &NodeShape) -> Option<PriceHint>;

    /// A short, stable, human-readable name for this provisioner backend.
    ///
    /// Synchronous; the string is borrowed from the provisioner.
    fn name(&self) -> &str;
}

// The cloud-init reference implementation lives in its own module and is only
// compiled when the `cloud-init` Cargo feature is enabled.
#[cfg(feature = "cloud-init")]
pub mod cloud_init;
// Re-export the reference implementation's public types at the crate root,
// behind the same feature gate as the module itself.
#[cfg(feature = "cloud-init")]
pub use cloud_init::{CloudInitConfig, CloudInitProvisioner};

/// Unit tests for the crate's value types and error formatting.
///
/// The trait itself has no default behavior to test here; these tests pin the
/// defaults, constructors, derives, and `Display` strings that downstream
/// consumers rely on.
#[cfg(test)]
mod tests {
    use super::{
        CapacityType, JoinState, NodeHandle, NodeShape, PriceHint, ProviderNodeId, ProvisionerError,
    };
    use serde::{Deserialize, Serialize};

    /// Compile-time assertion that the value types implement `serde` traits.
    /// (No format crate is a dependency of this crate, so we exercise the
    /// derives statically rather than round-tripping through JSON/YAML.)
    fn assert_serde<T: Serialize + for<'de> Deserialize<'de>>() {}

    /// Every public value type must be both serializable and deserializable.
    #[test]
    fn value_types_implement_serde() {
        assert_serde::<NodeShape>();
        assert_serde::<NodeHandle>();
        assert_serde::<PriceHint>();
        assert_serde::<CapacityType>();
        assert_serde::<JoinState>();
    }

    /// `NodeShape::default()` is 1 vCPU / 1 GiB with every optional field unset.
    #[test]
    fn node_shape_default_is_one_cpu_one_gib() {
        let shape = NodeShape::default();
        assert!((shape.cpu - 1.0).abs() < f64::EPSILON);
        assert_eq!(shape.memory_bytes, 1024 * 1024 * 1024);
        assert_eq!(shape.gpu, 0);
        assert!(shape.gpu_vendor.is_none());
        assert!(shape.labels.is_empty());
        assert!(shape.zone.is_none());
        assert_eq!(shape.capacity_type, CapacityType::OnDemand);
    }

    /// `NodeShape::new` overrides CPU and memory and keeps the other defaults.
    #[test]
    fn node_shape_new_sets_cpu_and_memory() {
        let shape = NodeShape::new(4.0, 8 * 1024 * 1024 * 1024);
        assert!((shape.cpu - 4.0).abs() < f64::EPSILON);
        assert_eq!(shape.memory_bytes, 8 * 1024 * 1024 * 1024);
        assert_eq!(shape.capacity_type, CapacityType::OnDemand);
    }

    /// The derived `Default` for `CapacityType` picks the `#[default]` variant.
    #[test]
    fn capacity_type_default_is_on_demand() {
        assert_eq!(CapacityType::default(), CapacityType::OnDemand);
    }

    /// Cloning a fully populated shape copies every field.
    #[test]
    fn node_shape_clone_preserves_fields() {
        let mut shape = NodeShape::new(2.0, 4 * 1024 * 1024 * 1024);
        shape.gpu = 1;
        shape.gpu_vendor = Some("nvidia".to_string());
        shape.zone = Some("us-east-1a".to_string());
        shape.capacity_type = CapacityType::Spot;
        shape
            .labels
            .insert("role".to_string(), "worker".to_string());

        let back = shape.clone();
        assert!((back.cpu - shape.cpu).abs() < f64::EPSILON);
        assert_eq!(back.memory_bytes, shape.memory_bytes);
        assert_eq!(back.gpu, shape.gpu);
        assert_eq!(back.gpu_vendor, shape.gpu_vendor);
        assert_eq!(back.zone, shape.zone);
        assert_eq!(back.capacity_type, shape.capacity_type);
        assert_eq!(back.labels, shape.labels);
    }

    /// A `NodeHandle` built from a struct literal exposes the values it was given.
    #[test]
    fn node_handle_carries_state() {
        let handle = NodeHandle {
            provider_id: "i-0123".to_string(),
            address: Some("10.0.0.5".to_string()),
            zone: Some("us-east-1a".to_string()),
            capacity_type: CapacityType::Spot,
            join_state: JoinState::Joining,
        };
        assert_eq!(handle.provider_id, "i-0123");
        assert_eq!(handle.join_state, JoinState::Joining);
        assert_eq!(handle.capacity_type, CapacityType::Spot);
    }

    /// A `PriceHint` built from a struct literal exposes the values it was given.
    #[test]
    fn price_hint_carries_fields() {
        let hint = PriceHint {
            hourly_usd: 0.42,
            capacity_type: CapacityType::OnDemand,
        };
        assert!((hint.hourly_usd - 0.42).abs() < f64::EPSILON);
        assert_eq!(hint.capacity_type, CapacityType::OnDemand);
    }

    /// `ProviderNodeId` is a transparent alias for `String`.
    #[test]
    fn provider_node_id_is_string() {
        let id: ProviderNodeId = "node-7".to_string();
        assert_eq!(id, "node-7");
    }

    /// Pins the `Display` output of every `ProvisionerError` variant, so a
    /// change to any `#[error(...)]` format string is caught.
    #[test]
    fn provisioner_error_display() {
        let cases = [
            (
                ProvisionerError::Unsupported("gpu".to_string()),
                "unsupported: gpu",
            ),
            (
                ProvisionerError::Capacity("none left".to_string()),
                "capacity unavailable: none left",
            ),
            (ProvisionerError::Auth("denied".to_string()), "auth: denied"),
            (
                ProvisionerError::Transport("timeout".to_string()),
                "transport: timeout",
            ),
            // `Other` renders the message verbatim, with no category prefix.
            (ProvisionerError::Other("boom".to_string()), "boom"),
        ];

        for (err, expected) in cases {
            assert_eq!(err.to_string(), expected);
        }
    }
}