1#![doc = r"
2Provider-agnostic cloud node provisioning for `ZLayer` autoscaling.
3
4This crate defines the [`CloudProvisioner`] trait and the value types used to
5request and describe worker nodes that join a `ZLayer` cluster. The trait plus the
6core value types depend only on `async-trait`, `serde`, and `thiserror`, so a
7downstream consumer (for example a `ZataCloudDeploy` backend) can implement
8the trait against its own cloud SDK without pulling in the reference
9implementation's runtime dependencies.
10
11The built-in [`CloudInitProvisioner`] (behind the default `cloud-init` feature)
12is a provider-agnostic implementation that shells out to operator-supplied
13commands and feeds each node a cloud-init `#cloud-config` that runs
14`zlayer node join` on boot. It requires no cloud SDK.
15
16# Identifiers
17
18[`ProviderNodeId`] is the provider-scoped identifier for a node (for example an
19`EC2` instance id or an opaque token printed by a provisioning script). It is
20deliberately distinct from the raft layer's numeric node id.
21"]
22
23use std::collections::BTreeMap;
24
/// Provider-scoped identifier for a node.
///
/// This is a plain alias for [`String`], so it adds no type safety of its own;
/// it exists to name the role of the value in signatures. Per the crate-level
/// documentation it is deliberately distinct from the raft layer's numeric
/// node id.
pub type ProviderNodeId = String;
31
/// Capacity class a node is requested with or reported as.
///
/// [`CapacityType::OnDemand`] is the [`Default`] variant.
///
/// NOTE(review): the variant descriptions below are inferred from the variant
/// names; confirm against the provider integrations.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum CapacityType {
    /// Regular capacity (presumably non-preemptible). This is the default.
    #[default]
    OnDemand,
    /// Spot capacity (presumably cheaper but reclaimable by the provider).
    Spot,
}
42
43#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
47pub struct NodeShape {
48 pub cpu: f64,
50 pub memory_bytes: u64,
52 pub gpu: u32,
54 pub gpu_vendor: Option<String>,
56 pub labels: BTreeMap<String, String>,
58 pub zone: Option<String>,
60 pub capacity_type: CapacityType,
62}
63
64impl Default for NodeShape {
65 fn default() -> Self {
66 Self {
67 cpu: 1.0,
68 memory_bytes: 1024 * 1024 * 1024,
69 gpu: 0,
70 gpu_vendor: None,
71 labels: BTreeMap::new(),
72 zone: None,
73 capacity_type: CapacityType::OnDemand,
74 }
75 }
76}
77
78impl NodeShape {
79 #[must_use]
82 pub fn new(cpu: f64, memory_bytes: u64) -> Self {
83 Self {
84 cpu,
85 memory_bytes,
86 ..Self::default()
87 }
88 }
89}
90
/// Progress of a node towards cluster membership, as reported in
/// [`NodeHandle::join_state`].
///
/// NOTE(review): the per-variant descriptions are inferred from the variant
/// names and their declaration order; confirm against the code that sets them.
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum JoinState {
    /// The provider is presumably still creating the node.
    Provisioning,
    /// The node presumably exists and is booting.
    Booting,
    /// The node is presumably in the process of joining the cluster.
    Joining,
    /// The node has presumably joined the cluster.
    Joined,
    /// Provisioning or joining presumably failed.
    Failed,
}
105
106#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
108pub struct NodeHandle {
109 pub provider_id: ProviderNodeId,
111 pub address: Option<String>,
113 pub zone: Option<String>,
115 pub capacity_type: CapacityType,
117 pub join_state: JoinState,
119}
120
121#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
123pub struct PriceHint {
124 pub hourly_usd: f64,
126 pub capacity_type: CapacityType,
128}
129
/// Error returned by [`CloudProvisioner`] operations.
///
/// Every variant carries a free-form message. The `Display` output is the
/// message with a variant-specific prefix, except for
/// [`ProvisionerError::Other`], which prints the message unchanged.
///
/// NOTE(review): when each variant should be used is inferred from its name
/// and prefix; confirm against the implementations.
#[derive(thiserror::Error, Debug)]
pub enum ProvisionerError {
    /// The requested operation or shape is not supported.
    /// Displays as `unsupported: <message>`.
    #[error("unsupported: {0}")]
    Unsupported(String),
    /// The provider could not supply the requested capacity.
    /// Displays as `capacity unavailable: <message>`.
    #[error("capacity unavailable: {0}")]
    Capacity(String),
    /// Authentication or authorization problem.
    /// Displays as `auth: <message>`.
    #[error("auth: {0}")]
    Auth(String),
    /// Communication problem while talking to the provider.
    /// Displays as `transport: <message>`.
    #[error("transport: {0}")]
    Transport(String),
    /// Any other failure. Displays as the bare message.
    #[error("{0}")]
    Other(String),
}
149
/// Convenience alias for results whose error type is [`ProvisionerError`].
pub type Result<T> = std::result::Result<T, ProvisionerError>;
152
/// Interface a cloud backend implements to create, remove, and list nodes.
///
/// Implementors must be `Send + Sync`. The `async` methods are made usable in
/// a trait via the `async_trait` attribute macro.
///
/// NOTE(review): this trait only declares signatures; the behavioral notes on
/// the methods below are inferred from names and types and should be confirmed
/// against the implementations.
#[async_trait::async_trait]
pub trait CloudProvisioner: Send + Sync {
    /// Requests a node described by `shape` and returns a handle to it.
    ///
    /// # Errors
    ///
    /// Returns a [`ProvisionerError`] when the request fails.
    async fn provision(&self, shape: &NodeShape) -> Result<NodeHandle>;

    /// Terminates the node identified by `id`.
    ///
    /// # Errors
    ///
    /// Returns a [`ProvisionerError`] when the request fails.
    // `ProviderNodeId` is an alias for `String`, so `&ProviderNodeId` is a
    // `&String` and trips `clippy::ptr_arg`. The lint is silenced so the
    // signature keeps naming the alias.
    #[allow(clippy::ptr_arg)]
    async fn terminate(&self, id: &ProviderNodeId) -> Result<()>;

    /// Returns handles for nodes (presumably those this provisioner manages).
    ///
    /// # Errors
    ///
    /// Returns a [`ProvisionerError`] when the request fails.
    async fn describe(&self) -> Result<Vec<NodeHandle>>;

    /// Returns a slice of capacity types (presumably those this provisioner
    /// can provision).
    fn capacity_types(&self) -> &[CapacityType];

    /// Returns a price estimate for `shape`, or `None` (presumably when no
    /// pricing information is available).
    fn price_hint(&self, shape: &NodeShape) -> Option<PriceHint>;

    /// Returns the name of this provisioner.
    fn name(&self) -> &str;
}
198
// The reference implementation is compiled only when the `cloud-init` feature
// is enabled, so consumers that only need the trait and value types can turn
// it off.
#[cfg(feature = "cloud-init")]
pub mod cloud_init;
// Re-export the main `cloud_init` types at the crate root for convenience.
#[cfg(feature = "cloud-init")]
pub use cloud_init::{CloudInitConfig, CloudInitProvisioner};
203
#[cfg(test)]
mod tests {
    use super::{
        CapacityType, JoinState, NodeHandle, NodeShape, PriceHint, ProviderNodeId, ProvisionerError,
    };
    use serde::{Deserialize, Serialize};

    // Compile-time check only: instantiating this with `T` proves that `T`
    // implements both `Serialize` and `Deserialize` for any lifetime.
    fn assert_serde<T: Serialize + for<'de> Deserialize<'de>>() {}

    #[test]
    fn value_types_implement_serde() {
        assert_serde::<NodeShape>();
        assert_serde::<NodeHandle>();
        assert_serde::<PriceHint>();
        assert_serde::<CapacityType>();
        assert_serde::<JoinState>();
    }

    #[test]
    fn node_shape_default_is_one_cpu_one_gib() {
        let shape = NodeShape::default();
        assert!((shape.cpu - 1.0).abs() < f64::EPSILON);
        assert_eq!(shape.memory_bytes, 1024 * 1024 * 1024);
        assert_eq!(shape.gpu, 0);
        assert!(shape.gpu_vendor.is_none());
        assert!(shape.labels.is_empty());
        assert!(shape.zone.is_none());
        assert_eq!(shape.capacity_type, CapacityType::OnDemand);
    }

    #[test]
    fn node_shape_new_sets_cpu_and_memory() {
        let shape = NodeShape::new(4.0, 8 * 1024 * 1024 * 1024);
        assert!((shape.cpu - 4.0).abs() < f64::EPSILON);
        assert_eq!(shape.memory_bytes, 8 * 1024 * 1024 * 1024);
        // Every field not passed to `new` must fall back to its default.
        assert_eq!(shape.gpu, 0);
        assert!(shape.gpu_vendor.is_none());
        assert!(shape.labels.is_empty());
        assert!(shape.zone.is_none());
        assert_eq!(shape.capacity_type, CapacityType::OnDemand);
    }

    #[test]
    fn capacity_type_default_is_on_demand() {
        assert_eq!(CapacityType::default(), CapacityType::OnDemand);
    }

    #[test]
    fn node_shape_clone_preserves_fields() {
        let mut shape = NodeShape::new(2.0, 4 * 1024 * 1024 * 1024);
        shape.gpu = 1;
        shape.gpu_vendor = Some("nvidia".to_string());
        shape.zone = Some("us-east-1a".to_string());
        shape.capacity_type = CapacityType::Spot;
        shape
            .labels
            .insert("role".to_string(), "worker".to_string());

        let back = shape.clone();
        assert!((back.cpu - shape.cpu).abs() < f64::EPSILON);
        assert_eq!(back.memory_bytes, shape.memory_bytes);
        assert_eq!(back.gpu, shape.gpu);
        assert_eq!(back.gpu_vendor, shape.gpu_vendor);
        assert_eq!(back.zone, shape.zone);
        assert_eq!(back.capacity_type, shape.capacity_type);
        assert_eq!(back.labels, shape.labels);
    }

    #[test]
    fn node_handle_carries_state() {
        let handle = NodeHandle {
            provider_id: "i-0123".to_string(),
            address: Some("10.0.0.5".to_string()),
            zone: Some("us-east-1a".to_string()),
            capacity_type: CapacityType::Spot,
            join_state: JoinState::Joining,
        };
        assert_eq!(handle.provider_id, "i-0123");
        assert_eq!(handle.address.as_deref(), Some("10.0.0.5"));
        assert_eq!(handle.zone.as_deref(), Some("us-east-1a"));
        assert_eq!(handle.join_state, JoinState::Joining);
        assert_eq!(handle.capacity_type, CapacityType::Spot);
    }

    #[test]
    fn price_hint_carries_fields() {
        let hint = PriceHint {
            hourly_usd: 0.42,
            capacity_type: CapacityType::OnDemand,
        };
        assert!((hint.hourly_usd - 0.42).abs() < f64::EPSILON);
        assert_eq!(hint.capacity_type, CapacityType::OnDemand);
    }

    #[test]
    fn provider_node_id_is_string() {
        let id: ProviderNodeId = "node-7".to_string();
        assert_eq!(id, "node-7");
    }

    #[test]
    fn provisioner_error_display() {
        // One assertion per variant so a changed `#[error]` string is caught.
        assert_eq!(
            ProvisionerError::Unsupported("gpu".to_string()).to_string(),
            "unsupported: gpu"
        );
        assert_eq!(
            ProvisionerError::Capacity("none left".to_string()).to_string(),
            "capacity unavailable: none left"
        );
        assert_eq!(
            ProvisionerError::Auth("expired token".to_string()).to_string(),
            "auth: expired token"
        );
        assert_eq!(
            ProvisionerError::Transport("timeout".to_string()).to_string(),
            "transport: timeout"
        );
        assert_eq!(
            ProvisionerError::Other("boom".to_string()).to_string(),
            "boom"
        );
    }
}