Skip to main content

alien_core/resources/
compute_cluster.rs

//! ComputeCluster resource for long-running container workloads.
//!
//! A ComputeCluster represents the setup-owned compute boundary for containers.
//! Setup provisions:
//! - Auto Scaling Groups (AWS), Managed Instance Groups (GCP), or VM Scale Sets (Azure)
//! - IAM roles/service accounts for machine authentication
//! - Security groups/firewall rules
//! - Launch templates/instance configurations
9
10use crate::error::{ErrorData, Result};
11use crate::instance_catalog::Architecture;
12use crate::resource::{ResourceDefinition, ResourceOutputsDefinition, ResourceRef};
13use crate::ResourceType;
14use alien_error::AlienError;
15use bon::Builder;
16use serde::{Deserialize, Serialize};
17use std::any::Any;
18use std::fmt::Debug;
19
20/// GPU specification for a capacity group.
21#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
22#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
23#[serde(rename_all = "camelCase")]
24pub struct GpuSpec {
25    /// GPU type identifier (e.g., "nvidia-a100", "nvidia-t4")
26    #[serde(rename = "type")]
27    pub gpu_type: String,
28    /// Number of GPUs per machine
29    pub count: u32,
30}
31
32/// Machine resource profile for a capacity group.
33///
34/// Represents the hardware specifications for machines in a capacity group.
35/// These are hardware totals (what the instance type advertises), not allocatable
36/// capacity. The managed container scheduler internally subtracts system reserves for planning.
37#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
38#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
39#[serde(rename_all = "camelCase")]
40pub struct MachineProfile {
41    /// CPU cores per machine (hardware total) - stored as string to preserve precision
42    /// (e.g., "8.0", "4.5")
43    pub cpu: String,
44    /// Memory in bytes (hardware total)
45    pub memory_bytes: u64,
46    /// Ephemeral storage in bytes (hardware total)
47    pub ephemeral_storage_bytes: u64,
48    /// CPU architecture required or provided by this machine profile.
49    #[serde(skip_serializing_if = "Option::is_none")]
50    pub architecture: Option<Architecture>,
51    /// GPU specification (optional)
52    #[serde(skip_serializing_if = "Option::is_none")]
53    pub gpu: Option<GpuSpec>,
54}
55
56/// Allowed range and default for a count selected by the installer.
57#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
58#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
59#[serde(rename_all = "camelCase")]
60pub struct ComputeChoiceRange {
61    /// Lowest allowed value.
62    pub min: u32,
63    /// Highest allowed value.
64    pub max: u32,
65    /// Default value recommended when no installer override is supplied.
66    pub default: u32,
67}
68
69impl ComputeChoiceRange {
70    /// Returns whether a selected value is inside the allowed range.
71    pub fn contains(&self, value: u32) -> bool {
72        self.min <= value && value <= self.max
73    }
74}
75
76/// Source-declared scale policy for a capacity group.
77#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
78#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
79#[serde(rename_all = "camelCase", tag = "type")]
80pub enum CapacityGroupScalePolicy {
81    /// A fixed-size pool where the installer can choose the fixed machine count.
82    Fixed {
83        /// Allowed fixed machine count range.
84        machines: ComputeChoiceRange,
85    },
86    /// An autoscaling pool with separately bounded min and max counts.
87    Autoscale {
88        /// Allowed minimum machine count range.
89        min: ComputeChoiceRange,
90        /// Allowed maximum machine count range.
91        max: ComputeChoiceRange,
92    },
93}
94
95impl CapacityGroupScalePolicy {
96    /// Derive the legacy policy represented by selected min/max values.
97    pub fn from_selected_bounds(min_size: u32, max_size: u32) -> Self {
98        if min_size == max_size {
99            Self::Fixed {
100                machines: ComputeChoiceRange {
101                    min: min_size,
102                    max: max_size,
103                    default: min_size,
104                },
105            }
106        } else {
107            Self::Autoscale {
108                min: ComputeChoiceRange {
109                    min: min_size,
110                    max: min_size,
111                    default: min_size,
112                },
113                max: ComputeChoiceRange {
114                    min: max_size,
115                    max: max_size,
116                    default: max_size,
117                },
118            }
119        }
120    }
121
122    /// Default selected min bound.
123    pub fn default_min_size(&self) -> u32 {
124        match self {
125            Self::Fixed { machines } => machines.default,
126            Self::Autoscale { min, .. } => min.default,
127        }
128    }
129
130    /// Default selected max bound.
131    pub fn default_max_size(&self) -> u32 {
132        match self {
133            Self::Fixed { machines } => machines.default,
134            Self::Autoscale { max, .. } => max.default,
135        }
136    }
137}
138
139/// Capacity group definition.
140///
141/// A capacity group represents machines with identical hardware profiles.
142/// Each group becomes a separate Auto Scaling Group (AWS), Managed Instance Group (GCP),
143/// or VM Scale Set (Azure).
144#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
145#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
146#[serde(rename_all = "camelCase")]
147pub struct CapacityGroup {
148    /// Unique identifier for this capacity group (must be lowercase alphanumeric with hyphens)
149    pub group_id: String,
150    /// Provider machine selected at deployment time.
151    /// `alien.ts` should declare portable requirements; preflight materialization
152    /// fills this field from `StackSettings.compute`.
153    #[serde(skip_serializing_if = "Option::is_none")]
154    pub instance_type: Option<String>,
155    /// Machine resource profile (auto-derived from instance_type if not specified)
156    #[serde(skip_serializing_if = "Option::is_none")]
157    pub profile: Option<MachineProfile>,
158    /// Minimum number of machines (can be 0 for scale-to-zero)
159    pub min_size: u32,
160    /// Maximum number of machines (must be ≤ 10)
161    pub max_size: u32,
162    /// Source-declared scale policy and installer-editable bounds.
163    ///
164    /// Older stacks only have `minSize` and `maxSize`; planners derive an exact
165    /// fixed/autoscale policy from those selected bounds when this field is absent.
166    #[serde(skip_serializing_if = "Option::is_none")]
167    pub scale_policy: Option<CapacityGroupScalePolicy>,
168    /// Require instance types that expose nested virtualization (VT-x/EPT)
169    /// to guest VMs. This is needed by workloads that boot nested VMs inside
170    /// containers.
171    /// When true, the controller's instance-type selector is constrained
172    /// to a vetted nested-virt-capable allowlist.
173    #[serde(skip_serializing_if = "Option::is_none")]
174    pub nested_virtualization: Option<bool>,
175}
176
177/// ComputeCluster resource for running long-running container workloads.
178///
179/// A ComputeCluster provides the setup-owned machine boundary for containers.
180/// Alien may manage the worker fleet inside that boundary when setup grants
181/// `compute-cluster/management`.
182///
183/// ## Architecture
184///
185/// - **Setup** creates cloud resources: ASGs/MIGs/VMSSs, IAM roles, security groups
186/// - **Alien** manages allowed fleet operations: machine count and runtime
187///   machine image rollout
188/// - A node agent runs on each machine from the selected runtime image channel
189///
190/// ## Example
191///
192/// ```rust
193/// use alien_core::{CapacityGroup, ComputeCluster, MachineProfile};
194///
195/// let cluster = ComputeCluster::new("compute".to_string())
196///     .capacity_group(CapacityGroup {
197///         group_id: "general".to_string(),
198///         instance_type: None,
199///         profile: Some(MachineProfile {
200///             cpu: "4.0".to_string(),
201///             memory_bytes: 16 * 1024 * 1024 * 1024,
202///             ephemeral_storage_bytes: 20 * 1024 * 1024 * 1024,
203///             architecture: None,
204///             gpu: None,
205///         }),
206///         min_size: 1,
207///         max_size: 5,
208///         scale_policy: None,
209///         nested_virtualization: None,
210///     })
211///     .build();
212/// ```
213#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Builder)]
214#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
215#[serde(rename_all = "camelCase", deny_unknown_fields)]
216#[builder(start_fn = new)]
217pub struct ComputeCluster {
218    /// Unique identifier for the container cluster.
219    /// Must contain only alphanumeric characters, hyphens, and underscores.
220    #[builder(start_fn)]
221    pub id: String,
222
223    /// Capacity groups defining the machine pools for this cluster.
224    /// Each group becomes a separate ASG/MIG/VMSS.
225    #[builder(field)]
226    pub capacity_groups: Vec<CapacityGroup>,
227
228    /// Container CIDR block for internal container networking.
229    /// Auto-generated as "10.244.0.0/16" if not specified.
230    /// Each machine gets a /24 subnet from this range.
231    #[serde(skip_serializing_if = "Option::is_none")]
232    pub container_cidr: Option<String>,
233}
234
235impl ComputeCluster {
236    /// The resource type identifier for ComputeCluster
237    pub const RESOURCE_TYPE: ResourceType = ResourceType::from_static("compute-cluster");
238
239    /// Returns the cluster's unique identifier.
240    pub fn id(&self) -> &str {
241        &self.id
242    }
243
244    /// Returns the container CIDR, defaulting to "10.244.0.0/16" if not specified.
245    pub fn container_cidr(&self) -> &str {
246        self.container_cidr.as_deref().unwrap_or("10.244.0.0/16")
247    }
248}
249
250impl<S: compute_cluster_builder::State> ComputeClusterBuilder<S> {
251    /// Adds a capacity group to the cluster.
252    pub fn capacity_group(mut self, group: CapacityGroup) -> Self {
253        self.capacity_groups.push(group);
254        self
255    }
256}
257
258/// Status of a single capacity group within a ComputeCluster.
259#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
260#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
261#[serde(rename_all = "camelCase")]
262pub struct CapacityGroupStatus {
263    /// Capacity group ID
264    pub group_id: String,
265    /// Current number of machines
266    pub current_machines: u32,
267    /// Desired number of machines (from the managed container capacity plan)
268    pub desired_machines: u32,
269    /// Instance type being used
270    pub instance_type: String,
271}
272
273/// Outputs generated by a successfully provisioned ComputeCluster.
274#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
275#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
276#[serde(rename_all = "camelCase")]
277pub struct ComputeClusterOutputs {
278    /// Managed container cluster ID (workspace/project/deployment/resourceid format)
279    pub cluster_id: String,
280    /// Whether the managed container cluster is ready
281    pub horizon_ready: bool,
282    /// Status of each capacity group
283    pub capacity_group_statuses: Vec<CapacityGroupStatus>,
284    /// Total number of machines across all capacity groups
285    pub total_machines: u32,
286}
287
288impl ResourceOutputsDefinition for ComputeClusterOutputs {
289    fn get_resource_type(&self) -> ResourceType {
290        ComputeCluster::RESOURCE_TYPE.clone()
291    }
292
293    fn as_any(&self) -> &dyn Any {
294        self
295    }
296
297    fn box_clone(&self) -> Box<dyn ResourceOutputsDefinition> {
298        Box::new(self.clone())
299    }
300
301    fn outputs_eq(&self, other: &dyn ResourceOutputsDefinition) -> bool {
302        other.as_any().downcast_ref::<ComputeClusterOutputs>() == Some(self)
303    }
304
305    fn to_json_value(&self) -> serde_json::Result<serde_json::Value> {
306        serde_json::to_value(self)
307    }
308}
309
310impl ResourceDefinition for ComputeCluster {
311    fn get_resource_type(&self) -> ResourceType {
312        Self::RESOURCE_TYPE
313    }
314
315    fn id(&self) -> &str {
316        &self.id
317    }
318
319    fn get_dependencies(&self) -> Vec<ResourceRef> {
320        // ComputeCluster has no static dependencies.
321        // Network dependency is platform-specific:
322        // - AWS/GCP/Azure: Added by ComputeClusterMutation
323        // - Local/Kubernetes: Not needed (Docker/K8s handles networking)
324        // Platform controllers use require_dependency() at runtime to access Network state.
325        Vec::new()
326    }
327
328    fn validate_update(&self, new_config: &dyn ResourceDefinition) -> Result<()> {
329        let new_cluster = new_config
330            .as_any()
331            .downcast_ref::<ComputeCluster>()
332            .ok_or_else(|| {
333                AlienError::new(ErrorData::UnexpectedResourceType {
334                    resource_id: self.id.clone(),
335                    expected: Self::RESOURCE_TYPE,
336                    actual: new_config.get_resource_type(),
337                })
338            })?;
339
340        if self.id != new_cluster.id {
341            return Err(AlienError::new(ErrorData::InvalidResourceUpdate {
342                resource_id: self.id.clone(),
343                reason: "the 'id' field is immutable".to_string(),
344            }));
345        }
346
347        // Container CIDR is immutable once set
348        if self.container_cidr.is_some()
349            && new_cluster.container_cidr.is_some()
350            && self.container_cidr != new_cluster.container_cidr
351        {
352            return Err(AlienError::new(ErrorData::InvalidResourceUpdate {
353                resource_id: self.id.clone(),
354                reason: "the 'containerCidr' field is immutable once set".to_string(),
355            }));
356        }
357
358        // Validate capacity groups
359        for new_group in &new_cluster.capacity_groups {
360            if let Some(existing_group) = self
361                .capacity_groups
362                .iter()
363                .find(|g| g.group_id == new_group.group_id)
364            {
365                // Instance type is immutable for existing groups
366                if existing_group.instance_type.is_some()
367                    && new_group.instance_type.is_some()
368                    && existing_group.instance_type != new_group.instance_type
369                {
370                    return Err(AlienError::new(ErrorData::InvalidResourceUpdate {
371                        resource_id: self.id.clone(),
372                        reason: format!(
373                            "instance type for capacity group '{}' is immutable",
374                            new_group.group_id
375                        ),
376                    }));
377                }
378            }
379        }
380
381        Ok(())
382    }
383
384    fn as_any(&self) -> &dyn Any {
385        self
386    }
387
388    fn as_any_mut(&mut self) -> &mut dyn Any {
389        self
390    }
391
392    fn box_clone(&self) -> Box<dyn ResourceDefinition> {
393        Box::new(self.clone())
394    }
395
396    fn resource_eq(&self, other: &dyn ResourceDefinition) -> bool {
397        other.as_any().downcast_ref::<ComputeCluster>() == Some(self)
398    }
399
400    fn to_json_value(&self) -> serde_json::Result<serde_json::Value> {
401        serde_json::to_value(self)
402    }
403}
404
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a capacity group with the fields these tests vary; the profile,
    /// scale policy and nested-virtualization flag are left unset.
    fn group(
        group_id: &str,
        instance_type: Option<&str>,
        min_size: u32,
        max_size: u32,
    ) -> CapacityGroup {
        CapacityGroup {
            group_id: group_id.to_string(),
            instance_type: instance_type.map(String::from),
            profile: None,
            min_size,
            max_size,
            scale_policy: None,
            nested_virtualization: None,
        }
    }

    /// Builds a cluster with the given id and exactly one capacity group.
    fn single_group_cluster(id: &str, only_group: CapacityGroup) -> ComputeCluster {
        ComputeCluster::new(id.to_string())
            .capacity_group(only_group)
            .build()
    }

    #[test]
    fn test_compute_cluster_creation() {
        let cluster = single_group_cluster("compute", group("general", Some("m7g.xlarge"), 1, 5));

        assert_eq!(cluster.id(), "compute");
        assert_eq!(cluster.capacity_groups.len(), 1);
        assert_eq!(cluster.capacity_groups[0].group_id, "general");
        assert_eq!(cluster.container_cidr(), "10.244.0.0/16");
    }

    #[test]
    fn test_compute_cluster_multiple_capacity_groups() {
        let gpu_group = CapacityGroup {
            profile: Some(MachineProfile {
                cpu: "4.0".to_string(),
                memory_bytes: 17179869184,             // 16 GiB
                ephemeral_storage_bytes: 214748364800, // 200 GiB
                architecture: None,
                gpu: Some(GpuSpec {
                    gpu_type: "nvidia-a10g".to_string(),
                    count: 1,
                }),
            }),
            ..group("gpu", Some("g5.xlarge"), 0, 2)
        };

        let cluster = ComputeCluster::new("multi-pool".to_string())
            .capacity_group(group("general", Some("m7g.xlarge"), 1, 3))
            .capacity_group(gpu_group)
            .build();

        assert_eq!(cluster.capacity_groups.len(), 2);
        assert_eq!(cluster.capacity_groups[0].group_id, "general");
        assert_eq!(cluster.capacity_groups[1].group_id, "gpu");
        assert!(cluster.capacity_groups[1]
            .profile
            .as_ref()
            .unwrap()
            .gpu
            .is_some());
    }

    #[test]
    fn test_compute_cluster_custom_cidr() {
        let cluster = ComputeCluster::new("custom-net".to_string())
            .container_cidr("172.30.0.0/16".to_string())
            .capacity_group(group("general", None, 1, 5))
            .build();

        assert_eq!(cluster.container_cidr(), "172.30.0.0/16");
    }

    #[test]
    fn test_compute_cluster_validate_update_immutable_id() {
        let cluster1 = single_group_cluster("cluster-1", group("general", None, 1, 5));
        let cluster2 = single_group_cluster("cluster-2", group("general", None, 1, 5));

        let result = cluster1.validate_update(&cluster2);
        assert!(result.is_err());
    }

    #[test]
    fn test_compute_cluster_validate_update_scale_change() {
        let cluster1 = single_group_cluster("compute", group("general", Some("m7g.xlarge"), 1, 5));
        let cluster2 = single_group_cluster("compute", group("general", Some("m7g.xlarge"), 2, 10));

        // Scale changes should be allowed
        let result = cluster1.validate_update(&cluster2);
        assert!(result.is_ok());
    }

    #[test]
    fn test_compute_cluster_validate_update_immutable_instance_type() {
        let pinned = single_group_cluster("compute", group("general", Some("m7g.xlarge"), 1, 5));
        let retyped = single_group_cluster("compute", group("general", Some("m7g.2xlarge"), 1, 5));

        // Switching an existing group to another instance type is rejected.
        assert!(pinned.validate_update(&retyped).is_err());

        // Only explicit values are compared, so filling in a previously
        // unspecified instance type is accepted.
        let unpinned = single_group_cluster("compute", group("general", None, 1, 5));
        assert!(unpinned.validate_update(&pinned).is_ok());

        // A group that did not exist before may use any instance type.
        let other_group = single_group_cluster("compute", group("extra", Some("g5.xlarge"), 1, 5));
        assert!(pinned.validate_update(&other_group).is_ok());
    }

    #[test]
    fn test_compute_cluster_validate_update_immutable_cidr() {
        let with_cidr = |cidr: &str| {
            ComputeCluster::new("compute".to_string())
                .container_cidr(cidr.to_string())
                .capacity_group(group("general", None, 1, 5))
                .build()
        };

        let original = with_cidr("172.30.0.0/16");

        // Changing an explicitly configured CIDR is rejected.
        assert!(original
            .validate_update(&with_cidr("172.31.0.0/16"))
            .is_err());

        // Keeping the same CIDR is accepted.
        assert!(original
            .validate_update(&with_cidr("172.30.0.0/16"))
            .is_ok());
    }

    #[test]
    fn test_compute_choice_range_contains() {
        let range = ComputeChoiceRange {
            min: 2,
            max: 5,
            default: 3,
        };

        assert!(!range.contains(1));
        assert!(range.contains(2));
        assert!(range.contains(5));
        assert!(!range.contains(6));
    }

    #[test]
    fn test_scale_policy_from_selected_bounds() {
        let fixed = CapacityGroupScalePolicy::from_selected_bounds(3, 3);
        assert_eq!(
            fixed,
            CapacityGroupScalePolicy::Fixed {
                machines: ComputeChoiceRange {
                    min: 3,
                    max: 3,
                    default: 3,
                },
            }
        );
        assert_eq!(fixed.default_min_size(), 3);
        assert_eq!(fixed.default_max_size(), 3);

        let autoscale = CapacityGroupScalePolicy::from_selected_bounds(1, 5);
        assert_eq!(
            autoscale,
            CapacityGroupScalePolicy::Autoscale {
                min: ComputeChoiceRange {
                    min: 1,
                    max: 1,
                    default: 1,
                },
                max: ComputeChoiceRange {
                    min: 5,
                    max: 5,
                    default: 5,
                },
            }
        );
        assert_eq!(autoscale.default_min_size(), 1);
        assert_eq!(autoscale.default_max_size(), 5);
    }

    #[test]
    fn test_compute_cluster_serialization() {
        let cluster =
            single_group_cluster("test-cluster", group("general", Some("m7g.xlarge"), 1, 5));

        let json = serde_json::to_string(&cluster).unwrap();
        let deserialized: ComputeCluster = serde_json::from_str(&json).unwrap();
        assert_eq!(cluster, deserialized);
    }
}