Skip to main content

zlayer_provisioner/
lib.rs

1#![doc = r"
2Provider-agnostic cloud node provisioning for `ZLayer` autoscaling.
3
4This crate defines the [`CloudProvisioner`] trait and the value types used to
5request and describe worker nodes that join a `ZLayer` cluster. The trait plus the
6core value types depend only on `async-trait`, `serde`, and `thiserror`, so a
7downstream consumer (for example a `ZataCloudDeploy` backend) can implement
8the trait against its own cloud SDK without pulling in the reference
9implementation's runtime dependencies.
10
11The built-in [`CloudInitProvisioner`] (behind the default `cloud-init` feature)
12is a provider-agnostic implementation that shells out to operator-supplied
13commands and feeds each node a cloud-init `#cloud-config` that runs
14`zlayer node join` on boot. It requires no cloud SDK.
15
16# Identifiers
17
18[`ProviderNodeId`] is the provider-scoped identifier for a node (for example an
19`EC2` instance id or an opaque token printed by a provisioning script). It is
20deliberately distinct from the raft layer's numeric node id.
21"]
22
23use std::collections::BTreeMap;
24
/// Identifier that a provisioning backend assigns to a node.
///
/// The value is opaque to this crate: it may be an instance id, a VM name, or
/// any other token the backend hands out. It is a separate concept from the
/// raft layer's numeric `NodeId` and the two must not be conflated.
pub type ProviderNodeId = String;
31
32/// Whether a node is billed as standard on-demand capacity or as interruptible
33/// spot/preemptible capacity.
34#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
35pub enum CapacityType {
36    /// Standard, non-interruptible capacity.
37    #[default]
38    OnDemand,
39    /// Interruptible, discounted capacity (spot / preemptible).
40    Spot,
41}
42
43/// Desired shape of a node to provision.
44///
45/// Construct with [`NodeShape::new`] and refine the optional fields directly.
46#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
47pub struct NodeShape {
48    /// Requested vCPU count (fractional allowed, e.g. for burstable shapes).
49    pub cpu: f64,
50    /// Requested memory in bytes.
51    pub memory_bytes: u64,
52    /// Requested GPU count (0 if none).
53    pub gpu: u32,
54    /// Optional GPU vendor hint (e.g. `"nvidia"`, `"amd"`).
55    pub gpu_vendor: Option<String>,
56    /// Arbitrary scheduling labels to attach to the node.
57    pub labels: BTreeMap<String, String>,
58    /// Optional availability zone / placement hint.
59    pub zone: Option<String>,
60    /// Desired billing capacity type.
61    pub capacity_type: CapacityType,
62}
63
64impl Default for NodeShape {
65    fn default() -> Self {
66        Self {
67            cpu: 1.0,
68            memory_bytes: 1024 * 1024 * 1024,
69            gpu: 0,
70            gpu_vendor: None,
71            labels: BTreeMap::new(),
72            zone: None,
73            capacity_type: CapacityType::OnDemand,
74        }
75    }
76}
77
78impl NodeShape {
79    /// Create a [`NodeShape`] with the given CPU and memory, defaulting the
80    /// remaining fields (no GPU, no labels, no zone, on-demand capacity).
81    #[must_use]
82    pub fn new(cpu: f64, memory_bytes: u64) -> Self {
83        Self {
84            cpu,
85            memory_bytes,
86            ..Self::default()
87        }
88    }
89}
90
91/// Lifecycle state of a provisioned node as it works toward cluster membership.
92#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
93pub enum JoinState {
94    /// The provider is creating the underlying machine.
95    Provisioning,
96    /// The machine exists and is booting.
97    Booting,
98    /// The node agent is attempting to join the cluster.
99    Joining,
100    /// The node has joined the cluster.
101    Joined,
102    /// Provisioning or joining failed.
103    Failed,
104}
105
106/// A handle to a provisioned (or in-flight) node.
107#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
108pub struct NodeHandle {
109    /// Provider-scoped identifier for the node.
110    pub provider_id: ProviderNodeId,
111    /// Reachable address (IP or hostname), once known.
112    pub address: Option<String>,
113    /// Availability zone the node landed in, if known.
114    pub zone: Option<String>,
115    /// Billing capacity type the node was provisioned as.
116    pub capacity_type: CapacityType,
117    /// Current lifecycle state.
118    pub join_state: JoinState,
119}
120
121/// A best-effort price estimate for a given shape.
122#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
123pub struct PriceHint {
124    /// Estimated hourly cost in US dollars.
125    pub hourly_usd: f64,
126    /// Capacity type the estimate applies to.
127    pub capacity_type: CapacityType,
128}
129
130/// Errors returned by a [`CloudProvisioner`].
131#[derive(thiserror::Error, Debug)]
132pub enum ProvisionerError {
133    /// The requested operation or shape is not supported by this provider.
134    #[error("unsupported: {0}")]
135    Unsupported(String),
136    /// The provider could not satisfy the request due to lack of capacity.
137    #[error("capacity unavailable: {0}")]
138    Capacity(String),
139    /// Authentication or authorization with the provider failed.
140    #[error("auth: {0}")]
141    Auth(String),
142    /// A transport / IO error occurred while talking to the provider.
143    #[error("transport: {0}")]
144    Transport(String),
145    /// Any other error.
146    #[error("{0}")]
147    Other(String),
148}
149
150/// Convenience result alias for provisioner operations.
151pub type Result<T> = std::result::Result<T, ProvisionerError>;
152
153/// A pluggable backend that can create, destroy, and enumerate `ZLayer` nodes.
154///
155/// Implementations are expected to be cheap to clone or share (the trait
156/// requires `Send + Sync`). Methods that talk to a remote provider are async;
157/// the introspection methods ([`capacity_types`](CloudProvisioner::capacity_types),
158/// [`price_hint`](CloudProvisioner::price_hint), [`name`](CloudProvisioner::name))
159/// are synchronous and must not block.
160#[async_trait::async_trait]
161pub trait CloudProvisioner: Send + Sync {
162    /// Provision a node matching `shape`.
163    ///
164    /// # Errors
165    ///
166    /// Returns [`ProvisionerError`] if the provider rejects the request, lacks
167    /// capacity, fails authentication, or the underlying transport fails.
168    async fn provision(&self, shape: &NodeShape) -> Result<NodeHandle>;
169
170    /// Terminate the node identified by `id`.
171    ///
172    /// # Errors
173    ///
174    /// Returns [`ProvisionerError`] if the provider fails to terminate the node
175    /// or the transport fails.
176    // `ProviderNodeId` is a `String` alias by design (provider ids are owned),
177    // so the `&ProviderNodeId` contract is intentional even though it resolves
178    // to `&String`.
179    #[allow(clippy::ptr_arg)]
180    async fn terminate(&self, id: &ProviderNodeId) -> Result<()>;
181
182    /// Enumerate the nodes currently known to this provisioner.
183    ///
184    /// # Errors
185    ///
186    /// Returns [`ProvisionerError`] if the provider cannot be queried.
187    async fn describe(&self) -> Result<Vec<NodeHandle>>;
188
189    /// The capacity types this provisioner supports.
190    fn capacity_types(&self) -> &[CapacityType];
191
192    /// A best-effort price estimate for `shape`, if one can be computed.
193    fn price_hint(&self, shape: &NodeShape) -> Option<PriceHint>;
194
195    /// A short, stable, human-readable name for this provisioner backend.
196    fn name(&self) -> &str;
197}
198
199#[cfg(feature = "cloud-init")]
200pub mod cloud_init;
201#[cfg(feature = "cloud-init")]
202pub use cloud_init::{CloudInitConfig, CloudInitProvisioner};
203
#[cfg(test)]
mod tests {
    use super::{
        CapacityType, JoinState, NodeHandle, NodeShape, PriceHint, ProviderNodeId, ProvisionerError,
    };
    use serde::{Deserialize, Serialize};

    /// Compile-time assertion that the value types implement `serde` traits.
    /// (No format crate is a dependency of this crate, so we exercise the
    /// derives statically rather than round-tripping through JSON/YAML.)
    fn assert_serde<T: Serialize + for<'de> Deserialize<'de>>() {}

    /// Every public value type must be both serializable and deserializable.
    #[test]
    fn value_types_implement_serde() {
        assert_serde::<NodeShape>();
        assert_serde::<NodeHandle>();
        assert_serde::<PriceHint>();
        assert_serde::<CapacityType>();
        assert_serde::<JoinState>();
    }

    /// `NodeShape::default()` is 1 vCPU / 1 GiB with every optional field unset.
    #[test]
    fn node_shape_default_is_one_cpu_one_gib() {
        let shape = NodeShape::default();
        assert!((shape.cpu - 1.0).abs() < f64::EPSILON);
        assert_eq!(shape.memory_bytes, 1024 * 1024 * 1024);
        assert_eq!(shape.gpu, 0);
        assert!(shape.gpu_vendor.is_none());
        assert!(shape.labels.is_empty());
        assert!(shape.zone.is_none());
        assert_eq!(shape.capacity_type, CapacityType::OnDemand);
    }

    /// `NodeShape::new` overrides CPU and memory and leaves the rest defaulted.
    #[test]
    fn node_shape_new_sets_cpu_and_memory() {
        let shape = NodeShape::new(4.0, 8 * 1024 * 1024 * 1024);
        assert!((shape.cpu - 4.0).abs() < f64::EPSILON);
        assert_eq!(shape.memory_bytes, 8 * 1024 * 1024 * 1024);
        assert_eq!(shape.capacity_type, CapacityType::OnDemand);
    }

    /// The derived `Default` for `CapacityType` selects `OnDemand`.
    #[test]
    fn capacity_type_default_is_on_demand() {
        assert_eq!(CapacityType::default(), CapacityType::OnDemand);
    }

    /// Cloning a fully populated shape copies every field.
    ///
    /// `NodeShape` does not derive `PartialEq`, so fields are compared one by
    /// one.
    #[test]
    fn node_shape_clone_preserves_fields() {
        let mut shape = NodeShape::new(2.0, 4 * 1024 * 1024 * 1024);
        shape.gpu = 1;
        shape.gpu_vendor = Some("nvidia".to_string());
        shape.zone = Some("us-east-1a".to_string());
        shape.capacity_type = CapacityType::Spot;
        shape
            .labels
            .insert("role".to_string(), "worker".to_string());

        let back = shape.clone();
        assert!((back.cpu - shape.cpu).abs() < f64::EPSILON);
        assert_eq!(back.memory_bytes, shape.memory_bytes);
        assert_eq!(back.gpu, shape.gpu);
        assert_eq!(back.gpu_vendor, shape.gpu_vendor);
        assert_eq!(back.zone, shape.zone);
        assert_eq!(back.capacity_type, shape.capacity_type);
        assert_eq!(back.labels, shape.labels);
    }

    /// A `NodeHandle` exposes the id, join state, and capacity type it was built with.
    #[test]
    fn node_handle_carries_state() {
        let handle = NodeHandle {
            provider_id: "i-0123".to_string(),
            address: Some("10.0.0.5".to_string()),
            zone: Some("us-east-1a".to_string()),
            capacity_type: CapacityType::Spot,
            join_state: JoinState::Joining,
        };
        assert_eq!(handle.provider_id, "i-0123");
        assert_eq!(handle.join_state, JoinState::Joining);
        assert_eq!(handle.capacity_type, CapacityType::Spot);
    }

    /// A `PriceHint` exposes the price and capacity type it was built with.
    #[test]
    fn price_hint_carries_fields() {
        let hint = PriceHint {
            hourly_usd: 0.42,
            capacity_type: CapacityType::OnDemand,
        };
        assert!((hint.hourly_usd - 0.42).abs() < f64::EPSILON);
        assert_eq!(hint.capacity_type, CapacityType::OnDemand);
    }

    /// `ProviderNodeId` is a plain `String` alias and compares like one.
    #[test]
    fn provider_node_id_is_string() {
        let id: ProviderNodeId = "node-7".to_string();
        assert_eq!(id, "node-7");
    }

    /// Every `ProvisionerError` variant renders with its documented prefix.
    ///
    /// All five variants are covered so a typo in any `#[error]` format
    /// string is caught.
    #[test]
    fn provisioner_error_display() {
        assert_eq!(
            ProvisionerError::Unsupported("gpu".to_string()).to_string(),
            "unsupported: gpu"
        );
        assert_eq!(
            ProvisionerError::Capacity("none left".to_string()).to_string(),
            "capacity unavailable: none left"
        );
        assert_eq!(
            ProvisionerError::Auth("denied".to_string()).to_string(),
            "auth: denied"
        );
        assert_eq!(
            ProvisionerError::Transport("reset".to_string()).to_string(),
            "transport: reset"
        );
        assert_eq!(
            ProvisionerError::Other("boom".to_string()).to_string(),
            "boom"
        );
    }
}