Skip to main content

alien_core/resources/
compute_cluster.rs

1//! ComputeCluster resource for long-running container workloads.
2//!
3//! A ComputeCluster represents the setup-owned compute boundary for containers.
4//! Setup provisions:
5//! - Auto Scaling Groups (AWS), Managed Instance Groups (GCP), or VM Scale Sets (Azure)
6//! - IAM roles/service accounts for machine authentication
7//! - Security groups/firewall rules
8//! - Launch templates/instance configurations
9
10use crate::error::{ErrorData, Result};
11use crate::resource::{ResourceDefinition, ResourceOutputsDefinition, ResourceRef};
12use crate::ResourceType;
13use alien_error::AlienError;
14use bon::Builder;
15use serde::{Deserialize, Serialize};
16use std::any::Any;
17use std::fmt::Debug;
18
19/// GPU specification for a capacity group.
20#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
21#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
22#[serde(rename_all = "camelCase")]
23pub struct GpuSpec {
24    /// GPU type identifier (e.g., "nvidia-a100", "nvidia-t4")
25    #[serde(rename = "type")]
26    pub gpu_type: String,
27    /// Number of GPUs per machine
28    pub count: u32,
29}
30
31/// Machine resource profile for a capacity group.
32///
33/// Represents the hardware specifications for machines in a capacity group.
34/// These are hardware totals (what the instance type advertises), not allocatable
35/// capacity. The managed container scheduler internally subtracts system reserves for planning.
36#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
37#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
38#[serde(rename_all = "camelCase")]
39pub struct MachineProfile {
40    /// CPU cores per machine (hardware total) - stored as string to preserve precision
41    /// (e.g., "8.0", "4.5")
42    pub cpu: String,
43    /// Memory in bytes (hardware total)
44    pub memory_bytes: u64,
45    /// Ephemeral storage in bytes (hardware total)
46    pub ephemeral_storage_bytes: u64,
47    /// GPU specification (optional)
48    #[serde(skip_serializing_if = "Option::is_none")]
49    pub gpu: Option<GpuSpec>,
50}
51
52/// Capacity group definition.
53///
54/// A capacity group represents machines with identical hardware profiles.
55/// Each group becomes a separate Auto Scaling Group (AWS), Managed Instance Group (GCP),
56/// or VM Scale Set (Azure).
57#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
58#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
59#[serde(rename_all = "camelCase")]
60pub struct CapacityGroup {
61    /// Unique identifier for this capacity group (must be lowercase alphanumeric with hyphens)
62    pub group_id: String,
63    /// Instance type for machines in this group (e.g., "m7g.xlarge", "n2-standard-8")
64    /// Auto-selected if not specified, based on profile requirements.
65    #[serde(skip_serializing_if = "Option::is_none")]
66    pub instance_type: Option<String>,
67    /// Machine resource profile (auto-derived from instance_type if not specified)
68    #[serde(skip_serializing_if = "Option::is_none")]
69    pub profile: Option<MachineProfile>,
70    /// Minimum number of machines (can be 0 for scale-to-zero)
71    pub min_size: u32,
72    /// Maximum number of machines (must be ≤ 10)
73    pub max_size: u32,
74}
75
76/// ComputeCluster resource for running long-running container workloads.
77///
78/// A ComputeCluster provides the setup-owned machine boundary for containers.
79/// Alien may manage the worker fleet inside that boundary when setup grants
80/// `compute-cluster/management`.
81///
82/// ## Architecture
83///
84/// - **Setup** creates cloud resources: ASGs/MIGs/VMSSs, IAM roles, security groups
85/// - **Alien** manages allowed fleet operations: machine count and Horizon machine image rollout
86/// - **horizond** runs on each machine from the selected Horizon machine image channel
87///
88/// ## Example
89///
90/// ```rust
91/// use alien_core::{ComputeCluster, CapacityGroup};
92///
93/// let cluster = ComputeCluster::new("compute".to_string())
94///     .capacity_group(CapacityGroup {
95///         group_id: "general".to_string(),
96///         instance_type: Some("m7g.xlarge".to_string()),
97///         profile: None,
98///         min_size: 1,
99///         max_size: 5,
100///     })
101///     .build();
102/// ```
103#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Builder)]
104#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
105#[serde(rename_all = "camelCase", deny_unknown_fields)]
106#[builder(start_fn = new)]
107pub struct ComputeCluster {
108    /// Unique identifier for the container cluster.
109    /// Must contain only alphanumeric characters, hyphens, and underscores.
110    #[builder(start_fn)]
111    pub id: String,
112
113    /// Capacity groups defining the machine pools for this cluster.
114    /// Each group becomes a separate ASG/MIG/VMSS.
115    #[builder(field)]
116    pub capacity_groups: Vec<CapacityGroup>,
117
118    /// Container CIDR block for internal container networking.
119    /// Auto-generated as "10.244.0.0/16" if not specified.
120    /// Each machine gets a /24 subnet from this range.
121    #[serde(skip_serializing_if = "Option::is_none")]
122    pub container_cidr: Option<String>,
123}
124
125impl ComputeCluster {
126    /// The resource type identifier for ComputeCluster
127    pub const RESOURCE_TYPE: ResourceType = ResourceType::from_static("compute-cluster");
128
129    /// Returns the cluster's unique identifier.
130    pub fn id(&self) -> &str {
131        &self.id
132    }
133
134    /// Returns the container CIDR, defaulting to "10.244.0.0/16" if not specified.
135    pub fn container_cidr(&self) -> &str {
136        self.container_cidr.as_deref().unwrap_or("10.244.0.0/16")
137    }
138}
139
140impl<S: compute_cluster_builder::State> ComputeClusterBuilder<S> {
141    /// Adds a capacity group to the cluster.
142    pub fn capacity_group(mut self, group: CapacityGroup) -> Self {
143        self.capacity_groups.push(group);
144        self
145    }
146}
147
148/// Status of a single capacity group within a ComputeCluster.
149#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
150#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
151#[serde(rename_all = "camelCase")]
152pub struct CapacityGroupStatus {
153    /// Capacity group ID
154    pub group_id: String,
155    /// Current number of machines
156    pub current_machines: u32,
157    /// Desired number of machines (from the managed container capacity plan)
158    pub desired_machines: u32,
159    /// Instance type being used
160    pub instance_type: String,
161}
162
163/// Outputs generated by a successfully provisioned ComputeCluster.
164#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
165#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
166#[serde(rename_all = "camelCase")]
167pub struct ComputeClusterOutputs {
168    /// Managed container cluster ID (workspace/project/deployment/resourceid format)
169    pub cluster_id: String,
170    /// Whether the managed container cluster is ready
171    pub horizon_ready: bool,
172    /// Status of each capacity group
173    pub capacity_group_statuses: Vec<CapacityGroupStatus>,
174    /// Total number of machines across all capacity groups
175    pub total_machines: u32,
176}
177
178impl ResourceOutputsDefinition for ComputeClusterOutputs {
179    fn get_resource_type(&self) -> ResourceType {
180        ComputeCluster::RESOURCE_TYPE.clone()
181    }
182
183    fn as_any(&self) -> &dyn Any {
184        self
185    }
186
187    fn box_clone(&self) -> Box<dyn ResourceOutputsDefinition> {
188        Box::new(self.clone())
189    }
190
191    fn outputs_eq(&self, other: &dyn ResourceOutputsDefinition) -> bool {
192        other.as_any().downcast_ref::<ComputeClusterOutputs>() == Some(self)
193    }
194
195    fn to_json_value(&self) -> serde_json::Result<serde_json::Value> {
196        serde_json::to_value(self)
197    }
198}
199
200impl ResourceDefinition for ComputeCluster {
201    fn get_resource_type(&self) -> ResourceType {
202        Self::RESOURCE_TYPE
203    }
204
205    fn id(&self) -> &str {
206        &self.id
207    }
208
209    fn get_dependencies(&self) -> Vec<ResourceRef> {
210        // ComputeCluster has no static dependencies.
211        // Network dependency is platform-specific:
212        // - AWS/GCP/Azure: Added by ComputeClusterMutation
213        // - Local/Kubernetes: Not needed (Docker/K8s handles networking)
214        // Platform controllers use require_dependency() at runtime to access Network state.
215        Vec::new()
216    }
217
218    fn validate_update(&self, new_config: &dyn ResourceDefinition) -> Result<()> {
219        let new_cluster = new_config
220            .as_any()
221            .downcast_ref::<ComputeCluster>()
222            .ok_or_else(|| {
223                AlienError::new(ErrorData::UnexpectedResourceType {
224                    resource_id: self.id.clone(),
225                    expected: Self::RESOURCE_TYPE,
226                    actual: new_config.get_resource_type(),
227                })
228            })?;
229
230        if self.id != new_cluster.id {
231            return Err(AlienError::new(ErrorData::InvalidResourceUpdate {
232                resource_id: self.id.clone(),
233                reason: "the 'id' field is immutable".to_string(),
234            }));
235        }
236
237        // Container CIDR is immutable once set
238        if self.container_cidr.is_some()
239            && new_cluster.container_cidr.is_some()
240            && self.container_cidr != new_cluster.container_cidr
241        {
242            return Err(AlienError::new(ErrorData::InvalidResourceUpdate {
243                resource_id: self.id.clone(),
244                reason: "the 'containerCidr' field is immutable once set".to_string(),
245            }));
246        }
247
248        // Validate capacity groups
249        for new_group in &new_cluster.capacity_groups {
250            if let Some(existing_group) = self
251                .capacity_groups
252                .iter()
253                .find(|g| g.group_id == new_group.group_id)
254            {
255                // Instance type is immutable for existing groups
256                if existing_group.instance_type.is_some()
257                    && new_group.instance_type.is_some()
258                    && existing_group.instance_type != new_group.instance_type
259                {
260                    return Err(AlienError::new(ErrorData::InvalidResourceUpdate {
261                        resource_id: self.id.clone(),
262                        reason: format!(
263                            "instance type for capacity group '{}' is immutable",
264                            new_group.group_id
265                        ),
266                    }));
267                }
268            }
269        }
270
271        Ok(())
272    }
273
274    fn as_any(&self) -> &dyn Any {
275        self
276    }
277
278    fn as_any_mut(&mut self) -> &mut dyn Any {
279        self
280    }
281
282    fn box_clone(&self) -> Box<dyn ResourceDefinition> {
283        Box::new(self.clone())
284    }
285
286    fn resource_eq(&self, other: &dyn ResourceDefinition) -> bool {
287        other.as_any().downcast_ref::<ComputeCluster>() == Some(self)
288    }
289
290    fn to_json_value(&self) -> serde_json::Result<serde_json::Value> {
291        serde_json::to_value(self)
292    }
293}
294
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a capacity group without an explicit machine profile.
    fn group(
        group_id: &str,
        instance_type: Option<&str>,
        min_size: u32,
        max_size: u32,
    ) -> CapacityGroup {
        CapacityGroup {
            group_id: group_id.to_string(),
            instance_type: instance_type.map(str::to_string),
            profile: None,
            min_size,
            max_size,
        }
    }

    #[test]
    fn test_compute_cluster_creation() {
        let cluster = ComputeCluster::new("compute".to_string())
            .capacity_group(group("general", Some("m7g.xlarge"), 1, 5))
            .build();

        assert_eq!(cluster.id(), "compute");
        assert_eq!(cluster.capacity_groups.len(), 1);
        assert_eq!(cluster.capacity_groups[0].group_id, "general");
        // No CIDR was configured, so the default must be reported.
        assert_eq!(cluster.container_cidr(), "10.244.0.0/16");
    }

    #[test]
    fn test_compute_cluster_multiple_capacity_groups() {
        let gpu_profile = MachineProfile {
            cpu: "4.0".to_string(),
            memory_bytes: 17179869184,             // 16 GiB
            ephemeral_storage_bytes: 214748364800, // 200 GiB
            gpu: Some(GpuSpec {
                gpu_type: "nvidia-a10g".to_string(),
                count: 1,
            }),
        };

        let cluster = ComputeCluster::new("multi-pool".to_string())
            .capacity_group(group("general", Some("m7g.xlarge"), 1, 3))
            .capacity_group(CapacityGroup {
                profile: Some(gpu_profile),
                ..group("gpu", Some("g5.xlarge"), 0, 2)
            })
            .build();

        assert_eq!(cluster.capacity_groups.len(), 2);
        assert_eq!(cluster.capacity_groups[0].group_id, "general");
        assert_eq!(cluster.capacity_groups[1].group_id, "gpu");

        let stored_profile = cluster.capacity_groups[1].profile.as_ref().unwrap();
        assert!(stored_profile.gpu.is_some());
    }

    #[test]
    fn test_compute_cluster_custom_cidr() {
        let cluster = ComputeCluster::new("custom-net".to_string())
            .container_cidr("172.30.0.0/16".to_string())
            .capacity_group(group("general", None, 1, 5))
            .build();

        assert_eq!(cluster.container_cidr(), "172.30.0.0/16");
    }

    #[test]
    fn test_compute_cluster_validate_update_immutable_id() {
        let build_with_id = |id: &str| {
            ComputeCluster::new(id.to_string())
                .capacity_group(group("general", None, 1, 5))
                .build()
        };
        let cluster1 = build_with_id("cluster-1");
        let cluster2 = build_with_id("cluster-2");

        // Only the id differs, and the id is immutable.
        let result = cluster1.validate_update(&cluster2);
        assert!(result.is_err());
    }

    #[test]
    fn test_compute_cluster_validate_update_scale_change() {
        let cluster1 = ComputeCluster::new("compute".to_string())
            .capacity_group(group("general", Some("m7g.xlarge"), 1, 5))
            .build();

        let cluster2 = ComputeCluster::new("compute".to_string())
            .capacity_group(group("general", Some("m7g.xlarge"), 2, 10))
            .build();

        // Scale changes should be allowed
        let result = cluster1.validate_update(&cluster2);
        assert!(result.is_ok());
    }

    #[test]
    fn test_compute_cluster_serialization() {
        let cluster = ComputeCluster::new("test-cluster".to_string())
            .capacity_group(group("general", Some("m7g.xlarge"), 1, 5))
            .build();

        // A JSON round trip must reproduce the original definition.
        let json = serde_json::to_string(&cluster).unwrap();
        let deserialized: ComputeCluster = serde_json::from_str(&json).unwrap();
        assert_eq!(cluster, deserialized);
    }
}