Skip to main content

alien_core/
instance_catalog.rs

1//! Instance type catalog and selection algorithm for cloud compute infrastructure.
2//!
3//! This module provides:
4//! - A static catalog of known instance types across AWS, GCP, and Azure
5//! - Resource quantity parsing (CPU strings, Kubernetes-style memory/storage quantities)
6//! - An algorithm to select the optimal instance type for a given workload
7//!
8//! The catalog is the single source of truth for instance type specifications.
9//! It is used by the preflights system to automatically populate `CapacityGroup.instance_type`
10//! and `CapacityGroup.profile` based on the containers in a stack.
11
12use crate::{GpuSpec, MachineProfile, Platform};
13use serde::{Deserialize, Serialize};
14
15// ---------------------------------------------------------------------------
16// Resource quantity parsing
17// ---------------------------------------------------------------------------
18
/// Parse a CPU quantity string into a number of cores.
///
/// Accepts plain decimal numbers (`"1"`, `"0.5"`, `"2.0"`) and the millicore
/// suffix (`"500m"` is `0.5` cores). Leading and trailing whitespace is ignored.
///
/// # Errors
///
/// Returns a human-readable message when the input is empty, is not a valid
/// number, or does not describe a finite, non-negative quantity
/// (for example `"NaN"`, `"inf"` or `"-1"`).
pub fn parse_cpu(s: &str) -> Result<f64, String> {
    let s = s.trim();
    if s.is_empty() {
        return Err("empty CPU string".to_string());
    }

    let cores = match s.strip_suffix('m') {
        Some(millis) => {
            let v: f64 = millis
                .parse()
                .map_err(|_| format!("invalid CPU millicore value: '{s}'"))?;
            v / 1000.0
        }
        None => s
            .parse::<f64>()
            .map_err(|_| format!("invalid CPU value: '{s}'"))?,
    };

    // The float parser happily accepts "NaN", "inf" and negative numbers; none
    // of them is a meaningful CPU request, so reject them here instead of
    // letting them leak into later capacity comparisons.
    if !cores.is_finite() || cores < 0.0 {
        return Err(format!(
            "CPU value must be a finite, non-negative number: '{s}'"
        ));
    }

    Ok(cores)
}
37
/// Parse a memory or storage quantity string to bytes.
///
/// Supports Kubernetes-style binary suffixes (`Ki`, `Mi`, `Gi`, `Ti`, powers of
/// 1024) and decimal suffixes (`k`, `M`, `G`, `T`, powers of 1000). A value with
/// a suffix may be fractional (`"1.5Gi"`); a fractional byte count is truncated
/// toward zero. A value without a suffix is interpreted as bytes and must be an
/// unsigned integer. Leading and trailing whitespace is ignored.
///
/// # Errors
///
/// Returns a human-readable message when the input is empty, is not a valid
/// number, is negative or non-finite, or does not fit in a `u64`.
pub fn parse_memory_bytes(s: &str) -> Result<u64, String> {
    // Suffix -> bytes-per-unit. No suffix is a tail of another one, so the
    // lookup order does not affect which entry matches.
    const UNITS: [(&str, u64); 8] = [
        ("Ti", 1 << 40),
        ("Gi", 1 << 30),
        ("Mi", 1 << 20),
        ("Ki", 1 << 10),
        ("T", 1_000_000_000_000),
        ("G", 1_000_000_000),
        ("M", 1_000_000),
        ("k", 1_000),
    ];

    let s = s.trim();
    if s.is_empty() {
        return Err("empty memory/storage string".to_string());
    }

    let invalid = || format!("invalid memory value: '{s}'");
    let out_of_range = || format!("memory value out of range: '{s}'");

    let split = UNITS
        .iter()
        .find_map(|&(suffix, unit)| s.strip_suffix(suffix).map(|number| (number, unit)));

    let (number, unit) = match split {
        Some(parts) => parts,
        // Plain bytes: only unsigned integers are accepted.
        None => return s.parse().map_err(|_| invalid()),
    };

    // Whole numbers are multiplied as integers so large quantities stay exact
    // (f64 cannot represent every integer above 2^53) and overflow is detected.
    if let Ok(whole) = number.parse::<u64>() {
        return whole.checked_mul(unit).ok_or_else(out_of_range);
    }

    let value: f64 = number.parse().map_err(|_| invalid())?;

    // A float-to-integer `as` cast saturates silently (negative and NaN become 0,
    // infinity becomes u64::MAX), so validate before converting.
    if !value.is_finite() || value < 0.0 {
        return Err(format!(
            "memory value must be a finite, non-negative number: '{s}'"
        ));
    }

    let bytes = value * unit as f64;
    // `u64::MAX as f64` rounds up to 2^64, the first value that no longer fits.
    if bytes >= u64::MAX as f64 {
        return Err(out_of_range());
    }

    Ok(bytes as u64)
}
104
105// ---------------------------------------------------------------------------
106// Instance type catalog
107// ---------------------------------------------------------------------------
108
/// Broad workload class an instance type belongs to.
///
/// Every catalog entry is tagged with exactly one family.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum InstanceFamily {
    /// Burstable instances (catalog examples: AWS `t4g`, GCP `e2`, Azure B-series).
    Burstable,
    /// General-purpose instances (catalog examples: AWS `m7g`/`m8i`, GCP `n2-standard`).
    GeneralPurpose,
    /// Compute-optimized instances (catalog examples: AWS `c7g`/`c8i`, GCP `c3-standard`).
    ComputeOptimized,
    /// Memory-optimized instances (catalog examples: AWS `r7g`, GCP `n2-highmem`).
    MemoryOptimized,
    /// Storage-optimized instances with large local storage (catalog example: AWS `i4i`).
    StorageOptimized,
    /// GPU-equipped instances (catalog examples: AWS `g5`/`p4d`/`p5`, GCP `a2`/`a3`).
    GpuCompute,
}
119
120/// CPU architecture.
121#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
122#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
123#[serde(rename_all = "snake_case")]
124pub enum Architecture {
125    Arm64,
126    X86_64,
127}
128
/// GPU description attached to a catalog entry.
///
/// Holds only a `&'static str` and an integer, so it is `Copy` and needs no
/// heap allocation.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CatalogGpu {
    /// GPU model identifier (the catalog uses values such as `"nvidia-t4"`).
    pub gpu_type: &'static str,
    /// Number of GPUs of this model on the instance.
    pub count: u32,
}
135
136/// A known instance type with its hardware specifications.
137///
138/// All fields are compile-time constants. The catalog is a flat array of these.
139#[derive(Debug, Clone)]
140pub struct InstanceTypeSpec {
141    pub name: &'static str,
142    pub platform: Platform,
143    pub family: InstanceFamily,
144    pub architecture: Architecture,
145    /// vCPU count (hardware total)
146    pub vcpu: u32,
147    /// Memory in bytes (hardware total)
148    pub memory_bytes: u64,
149    /// Ephemeral storage in bytes (hardware total, NVMe for storage-optimized)
150    pub ephemeral_storage_bytes: u64,
151    /// GPU specification (for GPU instances)
152    pub gpu: Option<CatalogGpu>,
153}
154
155impl InstanceTypeSpec {
156    /// Whether this instance type supports
157    /// `CpuOptions.NestedVirtualization=enabled` on AWS launch.
158    ///
159    /// Per AWS docs (`aws ec2 create-launch-template help`), nested
160    /// virtualization is only supported on 8th-generation Intel instance
161    /// types: c8i, m8i, r8i, and their `-flex` variants. We classify by
162    /// family-name prefix rather than a per-row bool so the existing 70+
163    /// catalog rows don't need an extra field.
164    pub fn is_nested_virt_capable(&self) -> bool {
165        if self.platform != Platform::Aws {
166            // GCP/Azure equivalents would need their own family lists.
167            // Today nested virt is wired through only for AWS.
168            return false;
169        }
170        let name = self.name;
171        name.starts_with("m8i.")
172            || name.starts_with("c8i.")
173            || name.starts_with("r8i.")
174            || name.starts_with("m8i-flex.")
175            || name.starts_with("c8i-flex.")
176            || name.starts_with("r8i-flex.")
177    }
178
179    /// Convert this catalog entry into a `MachineProfile` for use in `CapacityGroup`.
180    pub fn to_machine_profile(&self) -> MachineProfile {
181        MachineProfile {
182            cpu: format!("{}.0", self.vcpu),
183            memory_bytes: self.memory_bytes,
184            ephemeral_storage_bytes: self.ephemeral_storage_bytes,
185            architecture: Some(self.architecture),
186            gpu: self.gpu.map(|g| GpuSpec {
187                gpu_type: g.gpu_type.to_string(),
188                count: g.count,
189            }),
190        }
191    }
192}
193
// Binary byte-size units, used to keep the catalog's byte constants readable.
const KI: u64 = 1 << 10;
const MI: u64 = KI << 10;
const GI: u64 = MI << 10;
198
199/// The complete instance type catalog.
200///
201/// This is the single source of truth for instance type specifications.
202/// Update this array when adding support for new instance types.
203///
204/// NOTE: Ephemeral storage values for non-NVMe instances are conservative defaults
205/// (EBS-backed root volumes). Storage-optimized instances list their NVMe capacity.
206static CATALOG: &[InstanceTypeSpec] = &[
207    // =========================================================================
208    // AWS — ARM (Graviton) preferred for cost efficiency
209    // =========================================================================
210
211    // Burstable (t4g — ARM Graviton2)
212    InstanceTypeSpec {
213        name: "t4g.micro",
214        platform: Platform::Aws,
215        family: InstanceFamily::Burstable,
216        architecture: Architecture::Arm64,
217        vcpu: 2,
218        memory_bytes: 1 * GI,
219        ephemeral_storage_bytes: 20 * GI,
220        gpu: None,
221    },
222    InstanceTypeSpec {
223        name: "t4g.small",
224        platform: Platform::Aws,
225        family: InstanceFamily::Burstable,
226        architecture: Architecture::Arm64,
227        vcpu: 2,
228        memory_bytes: 2 * GI,
229        ephemeral_storage_bytes: 20 * GI,
230        gpu: None,
231    },
232    InstanceTypeSpec {
233        name: "t4g.medium",
234        platform: Platform::Aws,
235        family: InstanceFamily::Burstable,
236        architecture: Architecture::Arm64,
237        vcpu: 2,
238        memory_bytes: 4 * GI,
239        ephemeral_storage_bytes: 20 * GI,
240        gpu: None,
241    },
242    InstanceTypeSpec {
243        name: "t4g.large",
244        platform: Platform::Aws,
245        family: InstanceFamily::Burstable,
246        architecture: Architecture::Arm64,
247        vcpu: 2,
248        memory_bytes: 8 * GI,
249        ephemeral_storage_bytes: 20 * GI,
250        gpu: None,
251    },
252    InstanceTypeSpec {
253        name: "t4g.xlarge",
254        platform: Platform::Aws,
255        family: InstanceFamily::Burstable,
256        architecture: Architecture::Arm64,
257        vcpu: 4,
258        memory_bytes: 16 * GI,
259        ephemeral_storage_bytes: 20 * GI,
260        gpu: None,
261    },
262    // General Purpose (m7g — ARM Graviton3, up to 2xlarge / 8 vCPU)
263    InstanceTypeSpec {
264        name: "m7g.medium",
265        platform: Platform::Aws,
266        family: InstanceFamily::GeneralPurpose,
267        architecture: Architecture::Arm64,
268        vcpu: 1,
269        memory_bytes: 4 * GI,
270        ephemeral_storage_bytes: 20 * GI,
271        gpu: None,
272    },
273    InstanceTypeSpec {
274        name: "m7g.large",
275        platform: Platform::Aws,
276        family: InstanceFamily::GeneralPurpose,
277        architecture: Architecture::Arm64,
278        vcpu: 2,
279        memory_bytes: 8 * GI,
280        ephemeral_storage_bytes: 20 * GI,
281        gpu: None,
282    },
283    // 8th-gen Intel AWS families accept
284    // `CpuOptions.NestedVirtualization=enabled`. The catalog filter in
285    // `select_instance_type` includes these entries only when the
286    // workload requests nested virt, so ordinary workloads continue to
287    // pick the cost-efficient Graviton (m7g) above. The pairwise
288    // interleave keeps the per-family vCPU-non-decreasing invariant
289    // (see `test_catalog_instance_types_sorted_by_vcpu_within_family`).
290    InstanceTypeSpec {
291        name: "m8i.large",
292        platform: Platform::Aws,
293        family: InstanceFamily::GeneralPurpose,
294        architecture: Architecture::X86_64,
295        vcpu: 2,
296        memory_bytes: 8 * GI,
297        ephemeral_storage_bytes: 20 * GI,
298        gpu: None,
299    },
300    InstanceTypeSpec {
301        name: "m7g.xlarge",
302        platform: Platform::Aws,
303        family: InstanceFamily::GeneralPurpose,
304        architecture: Architecture::Arm64,
305        vcpu: 4,
306        memory_bytes: 16 * GI,
307        ephemeral_storage_bytes: 20 * GI,
308        gpu: None,
309    },
310    InstanceTypeSpec {
311        name: "m8i.xlarge",
312        platform: Platform::Aws,
313        family: InstanceFamily::GeneralPurpose,
314        architecture: Architecture::X86_64,
315        vcpu: 4,
316        memory_bytes: 16 * GI,
317        ephemeral_storage_bytes: 20 * GI,
318        gpu: None,
319    },
320    InstanceTypeSpec {
321        name: "m7g.2xlarge",
322        platform: Platform::Aws,
323        family: InstanceFamily::GeneralPurpose,
324        architecture: Architecture::Arm64,
325        vcpu: 8,
326        memory_bytes: 32 * GI,
327        ephemeral_storage_bytes: 20 * GI,
328        gpu: None,
329    },
330    InstanceTypeSpec {
331        name: "m8i.2xlarge",
332        platform: Platform::Aws,
333        family: InstanceFamily::GeneralPurpose,
334        architecture: Architecture::X86_64,
335        vcpu: 8,
336        memory_bytes: 32 * GI,
337        ephemeral_storage_bytes: 20 * GI,
338        gpu: None,
339    },
340    InstanceTypeSpec {
341        name: "m7g.4xlarge",
342        platform: Platform::Aws,
343        family: InstanceFamily::GeneralPurpose,
344        architecture: Architecture::Arm64,
345        vcpu: 16,
346        memory_bytes: 64 * GI,
347        ephemeral_storage_bytes: 20 * GI,
348        gpu: None,
349    },
350    InstanceTypeSpec {
351        name: "m8i.4xlarge",
352        platform: Platform::Aws,
353        family: InstanceFamily::GeneralPurpose,
354        architecture: Architecture::X86_64,
355        vcpu: 16,
356        memory_bytes: 64 * GI,
357        ephemeral_storage_bytes: 20 * GI,
358        gpu: None,
359    },
360    // Compute Optimized (c7g — ARM Graviton3, up to 2xlarge / 8 vCPU)
361    InstanceTypeSpec {
362        name: "c7g.medium",
363        platform: Platform::Aws,
364        family: InstanceFamily::ComputeOptimized,
365        architecture: Architecture::Arm64,
366        vcpu: 1,
367        memory_bytes: 2 * GI,
368        ephemeral_storage_bytes: 20 * GI,
369        gpu: None,
370    },
371    InstanceTypeSpec {
372        name: "c7g.large",
373        platform: Platform::Aws,
374        family: InstanceFamily::ComputeOptimized,
375        architecture: Architecture::Arm64,
376        vcpu: 2,
377        memory_bytes: 4 * GI,
378        ephemeral_storage_bytes: 20 * GI,
379        gpu: None,
380    },
381    InstanceTypeSpec {
382        name: "c8i.large",
383        platform: Platform::Aws,
384        family: InstanceFamily::ComputeOptimized,
385        architecture: Architecture::X86_64,
386        vcpu: 2,
387        memory_bytes: 4 * GI,
388        ephemeral_storage_bytes: 20 * GI,
389        gpu: None,
390    },
391    InstanceTypeSpec {
392        name: "c7g.xlarge",
393        platform: Platform::Aws,
394        family: InstanceFamily::ComputeOptimized,
395        architecture: Architecture::Arm64,
396        vcpu: 4,
397        memory_bytes: 8 * GI,
398        ephemeral_storage_bytes: 20 * GI,
399        gpu: None,
400    },
401    InstanceTypeSpec {
402        name: "c8i.xlarge",
403        platform: Platform::Aws,
404        family: InstanceFamily::ComputeOptimized,
405        architecture: Architecture::X86_64,
406        vcpu: 4,
407        memory_bytes: 8 * GI,
408        ephemeral_storage_bytes: 20 * GI,
409        gpu: None,
410    },
411    InstanceTypeSpec {
412        name: "c7g.2xlarge",
413        platform: Platform::Aws,
414        family: InstanceFamily::ComputeOptimized,
415        architecture: Architecture::Arm64,
416        vcpu: 8,
417        memory_bytes: 16 * GI,
418        ephemeral_storage_bytes: 20 * GI,
419        gpu: None,
420    },
421    InstanceTypeSpec {
422        name: "c8i.2xlarge",
423        platform: Platform::Aws,
424        family: InstanceFamily::ComputeOptimized,
425        architecture: Architecture::X86_64,
426        vcpu: 8,
427        memory_bytes: 16 * GI,
428        ephemeral_storage_bytes: 20 * GI,
429        gpu: None,
430    },
431    InstanceTypeSpec {
432        name: "c7g.4xlarge",
433        platform: Platform::Aws,
434        family: InstanceFamily::ComputeOptimized,
435        architecture: Architecture::Arm64,
436        vcpu: 16,
437        memory_bytes: 32 * GI,
438        ephemeral_storage_bytes: 20 * GI,
439        gpu: None,
440    },
441    InstanceTypeSpec {
442        name: "c8i.4xlarge",
443        platform: Platform::Aws,
444        family: InstanceFamily::ComputeOptimized,
445        architecture: Architecture::X86_64,
446        vcpu: 16,
447        memory_bytes: 32 * GI,
448        ephemeral_storage_bytes: 20 * GI,
449        gpu: None,
450    },
451    // Memory Optimized (r7g — ARM Graviton3, up to 2xlarge / 8 vCPU)
452    InstanceTypeSpec {
453        name: "r7g.medium",
454        platform: Platform::Aws,
455        family: InstanceFamily::MemoryOptimized,
456        architecture: Architecture::Arm64,
457        vcpu: 1,
458        memory_bytes: 8 * GI,
459        ephemeral_storage_bytes: 20 * GI,
460        gpu: None,
461    },
462    InstanceTypeSpec {
463        name: "r7g.large",
464        platform: Platform::Aws,
465        family: InstanceFamily::MemoryOptimized,
466        architecture: Architecture::Arm64,
467        vcpu: 2,
468        memory_bytes: 16 * GI,
469        ephemeral_storage_bytes: 20 * GI,
470        gpu: None,
471    },
472    InstanceTypeSpec {
473        name: "r7g.xlarge",
474        platform: Platform::Aws,
475        family: InstanceFamily::MemoryOptimized,
476        architecture: Architecture::Arm64,
477        vcpu: 4,
478        memory_bytes: 32 * GI,
479        ephemeral_storage_bytes: 20 * GI,
480        gpu: None,
481    },
482    InstanceTypeSpec {
483        name: "r7g.2xlarge",
484        platform: Platform::Aws,
485        family: InstanceFamily::MemoryOptimized,
486        architecture: Architecture::Arm64,
487        vcpu: 8,
488        memory_bytes: 64 * GI,
489        ephemeral_storage_bytes: 20 * GI,
490        gpu: None,
491    },
492    InstanceTypeSpec {
493        name: "r7g.4xlarge",
494        platform: Platform::Aws,
495        family: InstanceFamily::MemoryOptimized,
496        architecture: Architecture::Arm64,
497        vcpu: 16,
498        memory_bytes: 128 * GI,
499        ephemeral_storage_bytes: 20 * GI,
500        gpu: None,
501    },
502    // Storage Optimized (i4i — x86_64, NVMe)
503    InstanceTypeSpec {
504        name: "i4i.xlarge",
505        platform: Platform::Aws,
506        family: InstanceFamily::StorageOptimized,
507        architecture: Architecture::X86_64,
508        vcpu: 4,
509        memory_bytes: 32 * GI,
510        ephemeral_storage_bytes: 937 * GI,
511        gpu: None,
512    },
513    InstanceTypeSpec {
514        name: "i4i.2xlarge",
515        platform: Platform::Aws,
516        family: InstanceFamily::StorageOptimized,
517        architecture: Architecture::X86_64,
518        vcpu: 8,
519        memory_bytes: 64 * GI,
520        ephemeral_storage_bytes: 1875 * GI,
521        gpu: None,
522    },
523    InstanceTypeSpec {
524        name: "i4i.4xlarge",
525        platform: Platform::Aws,
526        family: InstanceFamily::StorageOptimized,
527        architecture: Architecture::X86_64,
528        vcpu: 16,
529        memory_bytes: 128 * GI,
530        ephemeral_storage_bytes: 3750 * GI,
531        gpu: None,
532    },
533    InstanceTypeSpec {
534        name: "i4i.8xlarge",
535        platform: Platform::Aws,
536        family: InstanceFamily::StorageOptimized,
537        architecture: Architecture::X86_64,
538        vcpu: 32,
539        memory_bytes: 256 * GI,
540        ephemeral_storage_bytes: 7500 * GI,
541        gpu: None,
542    },
543    // GPU — NVIDIA T4 (g5 — x86_64)
544    InstanceTypeSpec {
545        name: "g5.xlarge",
546        platform: Platform::Aws,
547        family: InstanceFamily::GpuCompute,
548        architecture: Architecture::X86_64,
549        vcpu: 4,
550        memory_bytes: 16 * GI,
551        ephemeral_storage_bytes: 250 * GI,
552        gpu: Some(CatalogGpu {
553            gpu_type: "nvidia-t4",
554            count: 1,
555        }),
556    },
557    InstanceTypeSpec {
558        name: "g5.2xlarge",
559        platform: Platform::Aws,
560        family: InstanceFamily::GpuCompute,
561        architecture: Architecture::X86_64,
562        vcpu: 8,
563        memory_bytes: 32 * GI,
564        ephemeral_storage_bytes: 450 * GI,
565        gpu: Some(CatalogGpu {
566            gpu_type: "nvidia-t4",
567            count: 1,
568        }),
569    },
570    // GPU — NVIDIA A100 (p4d — x86_64)
571    InstanceTypeSpec {
572        name: "p4d.24xlarge",
573        platform: Platform::Aws,
574        family: InstanceFamily::GpuCompute,
575        architecture: Architecture::X86_64,
576        vcpu: 96,
577        memory_bytes: 1152 * GI,
578        ephemeral_storage_bytes: 8000 * GI,
579        gpu: Some(CatalogGpu {
580            gpu_type: "nvidia-a100",
581            count: 8,
582        }),
583    },
584    // GPU — NVIDIA H100 (p5 — x86_64)
585    InstanceTypeSpec {
586        name: "p5.48xlarge",
587        platform: Platform::Aws,
588        family: InstanceFamily::GpuCompute,
589        architecture: Architecture::X86_64,
590        vcpu: 192,
591        memory_bytes: 2048 * GI,
592        ephemeral_storage_bytes: 8000 * GI,
593        gpu: Some(CatalogGpu {
594            gpu_type: "nvidia-h100",
595            count: 8,
596        }),
597    },
598    // =========================================================================
599    // GCP
600    // =========================================================================
601
602    // Burstable (e2)
603    InstanceTypeSpec {
604        name: "e2-micro",
605        platform: Platform::Gcp,
606        family: InstanceFamily::Burstable,
607        architecture: Architecture::X86_64,
608        vcpu: 2,
609        memory_bytes: 1 * GI,
610        ephemeral_storage_bytes: 20 * GI,
611        gpu: None,
612    },
613    InstanceTypeSpec {
614        name: "e2-small",
615        platform: Platform::Gcp,
616        family: InstanceFamily::Burstable,
617        architecture: Architecture::X86_64,
618        vcpu: 2,
619        memory_bytes: 2 * GI,
620        ephemeral_storage_bytes: 20 * GI,
621        gpu: None,
622    },
623    InstanceTypeSpec {
624        name: "e2-medium",
625        platform: Platform::Gcp,
626        family: InstanceFamily::Burstable,
627        architecture: Architecture::X86_64,
628        vcpu: 2,
629        memory_bytes: 4 * GI,
630        ephemeral_storage_bytes: 20 * GI,
631        gpu: None,
632    },
633    // General Purpose (n2-standard, up to 16 vCPU)
634    InstanceTypeSpec {
635        name: "n2-standard-2",
636        platform: Platform::Gcp,
637        family: InstanceFamily::GeneralPurpose,
638        architecture: Architecture::X86_64,
639        vcpu: 2,
640        memory_bytes: 8 * GI,
641        ephemeral_storage_bytes: 20 * GI,
642        gpu: None,
643    },
644    InstanceTypeSpec {
645        name: "n2-standard-4",
646        platform: Platform::Gcp,
647        family: InstanceFamily::GeneralPurpose,
648        architecture: Architecture::X86_64,
649        vcpu: 4,
650        memory_bytes: 16 * GI,
651        ephemeral_storage_bytes: 20 * GI,
652        gpu: None,
653    },
654    InstanceTypeSpec {
655        name: "n2-standard-8",
656        platform: Platform::Gcp,
657        family: InstanceFamily::GeneralPurpose,
658        architecture: Architecture::X86_64,
659        vcpu: 8,
660        memory_bytes: 32 * GI,
661        ephemeral_storage_bytes: 20 * GI,
662        gpu: None,
663    },
664    InstanceTypeSpec {
665        name: "n2-standard-16",
666        platform: Platform::Gcp,
667        family: InstanceFamily::GeneralPurpose,
668        architecture: Architecture::X86_64,
669        vcpu: 16,
670        memory_bytes: 64 * GI,
671        ephemeral_storage_bytes: 20 * GI,
672        gpu: None,
673    },
674    // Compute Optimized (c3-standard, up to 8 vCPU)
675    InstanceTypeSpec {
676        name: "c3-standard-4",
677        platform: Platform::Gcp,
678        family: InstanceFamily::ComputeOptimized,
679        architecture: Architecture::X86_64,
680        vcpu: 4,
681        memory_bytes: 8 * GI,
682        ephemeral_storage_bytes: 20 * GI,
683        gpu: None,
684    },
685    InstanceTypeSpec {
686        name: "c3-standard-8",
687        platform: Platform::Gcp,
688        family: InstanceFamily::ComputeOptimized,
689        architecture: Architecture::X86_64,
690        vcpu: 8,
691        memory_bytes: 16 * GI,
692        ephemeral_storage_bytes: 20 * GI,
693        gpu: None,
694    },
695    // Memory Optimized (n2-highmem, up to 8 vCPU)
696    InstanceTypeSpec {
697        name: "n2-highmem-2",
698        platform: Platform::Gcp,
699        family: InstanceFamily::MemoryOptimized,
700        architecture: Architecture::X86_64,
701        vcpu: 2,
702        memory_bytes: 16 * GI,
703        ephemeral_storage_bytes: 20 * GI,
704        gpu: None,
705    },
706    InstanceTypeSpec {
707        name: "n2-highmem-4",
708        platform: Platform::Gcp,
709        family: InstanceFamily::MemoryOptimized,
710        architecture: Architecture::X86_64,
711        vcpu: 4,
712        memory_bytes: 32 * GI,
713        ephemeral_storage_bytes: 20 * GI,
714        gpu: None,
715    },
716    InstanceTypeSpec {
717        name: "n2-highmem-8",
718        platform: Platform::Gcp,
719        family: InstanceFamily::MemoryOptimized,
720        architecture: Architecture::X86_64,
721        vcpu: 8,
722        memory_bytes: 64 * GI,
723        ephemeral_storage_bytes: 20 * GI,
724        gpu: None,
725    },
726    InstanceTypeSpec {
727        name: "n2-highmem-16",
728        platform: Platform::Gcp,
729        family: InstanceFamily::MemoryOptimized,
730        architecture: Architecture::X86_64,
731        vcpu: 16,
732        memory_bytes: 128 * GI,
733        ephemeral_storage_bytes: 20 * GI,
734        gpu: None,
735    },
736    InstanceTypeSpec {
737        name: "n2-highmem-32",
738        platform: Platform::Gcp,
739        family: InstanceFamily::MemoryOptimized,
740        architecture: Architecture::X86_64,
741        vcpu: 32,
742        memory_bytes: 256 * GI,
743        ephemeral_storage_bytes: 20 * GI,
744        gpu: None,
745    },
746    // Storage Optimized (c3d-standard with local SSD)
747    InstanceTypeSpec {
748        name: "c3d-standard-8",
749        platform: Platform::Gcp,
750        family: InstanceFamily::StorageOptimized,
751        architecture: Architecture::X86_64,
752        vcpu: 8,
753        memory_bytes: 32 * GI,
754        ephemeral_storage_bytes: 480 * GI,
755        gpu: None,
756    },
757    InstanceTypeSpec {
758        name: "c3d-standard-16",
759        platform: Platform::Gcp,
760        family: InstanceFamily::StorageOptimized,
761        architecture: Architecture::X86_64,
762        vcpu: 16,
763        memory_bytes: 64 * GI,
764        ephemeral_storage_bytes: 960 * GI,
765        gpu: None,
766    },
767    InstanceTypeSpec {
768        name: "c3d-standard-30",
769        platform: Platform::Gcp,
770        family: InstanceFamily::StorageOptimized,
771        architecture: Architecture::X86_64,
772        vcpu: 30,
773        memory_bytes: 120 * GI,
774        ephemeral_storage_bytes: 1920 * GI,
775        gpu: None,
776    },
777    // GPU — NVIDIA T4 (n1-standard + T4)
778    InstanceTypeSpec {
779        name: "n1-standard-4-t4",
780        platform: Platform::Gcp,
781        family: InstanceFamily::GpuCompute,
782        architecture: Architecture::X86_64,
783        vcpu: 4,
784        memory_bytes: 15 * GI,
785        ephemeral_storage_bytes: 100 * GI,
786        gpu: Some(CatalogGpu {
787            gpu_type: "nvidia-t4",
788            count: 1,
789        }),
790    },
791    // GPU — NVIDIA A100 (a2-highgpu)
792    InstanceTypeSpec {
793        name: "a2-highgpu-1g",
794        platform: Platform::Gcp,
795        family: InstanceFamily::GpuCompute,
796        architecture: Architecture::X86_64,
797        vcpu: 12,
798        memory_bytes: 85 * GI,
799        ephemeral_storage_bytes: 100 * GI,
800        gpu: Some(CatalogGpu {
801            gpu_type: "nvidia-a100",
802            count: 1,
803        }),
804    },
805    InstanceTypeSpec {
806        name: "a2-highgpu-8g",
807        platform: Platform::Gcp,
808        family: InstanceFamily::GpuCompute,
809        architecture: Architecture::X86_64,
810        vcpu: 96,
811        memory_bytes: 1360 * GI,
812        ephemeral_storage_bytes: 100 * GI,
813        gpu: Some(CatalogGpu {
814            gpu_type: "nvidia-a100",
815            count: 8,
816        }),
817    },
818    // GPU — NVIDIA H100 (a3-highgpu)
819    InstanceTypeSpec {
820        name: "a3-highgpu-8g",
821        platform: Platform::Gcp,
822        family: InstanceFamily::GpuCompute,
823        architecture: Architecture::X86_64,
824        vcpu: 208,
825        memory_bytes: 1872 * GI,
826        ephemeral_storage_bytes: 100 * GI,
827        gpu: Some(CatalogGpu {
828            gpu_type: "nvidia-h100",
829            count: 8,
830        }),
831    },
832    // =========================================================================
833    // Azure
834    // =========================================================================
835
836    // Burstable (B-series v2)
837    InstanceTypeSpec {
838        name: "Standard_B1s",
839        platform: Platform::Azure,
840        family: InstanceFamily::Burstable,
841        architecture: Architecture::X86_64,
842        vcpu: 1,
843        memory_bytes: 1 * GI,
844        ephemeral_storage_bytes: 20 * GI,
845        gpu: None,
846    },
847    InstanceTypeSpec {
848        name: "Standard_B2s",
849        platform: Platform::Azure,
850        family: InstanceFamily::Burstable,
851        architecture: Architecture::X86_64,
852        vcpu: 2,
853        memory_bytes: 4 * GI,
854        ephemeral_storage_bytes: 20 * GI,
855        gpu: None,
856    },
857    InstanceTypeSpec {
858        name: "Standard_B2ms",
859        platform: Platform::Azure,
860        family: InstanceFamily::Burstable,
861        architecture: Architecture::X86_64,
862        vcpu: 2,
863        memory_bytes: 8 * GI,
864        ephemeral_storage_bytes: 20 * GI,
865        gpu: None,
866    },
867    InstanceTypeSpec {
868        name: "Standard_B4ms",
869        platform: Platform::Azure,
870        family: InstanceFamily::Burstable,
871        architecture: Architecture::X86_64,
872        vcpu: 4,
873        memory_bytes: 16 * GI,
874        ephemeral_storage_bytes: 20 * GI,
875        gpu: None,
876    },
877    // General Purpose (Dv5-series, up to 16 vCPU)
878    InstanceTypeSpec {
879        name: "Standard_D2s_v5",
880        platform: Platform::Azure,
881        family: InstanceFamily::GeneralPurpose,
882        architecture: Architecture::X86_64,
883        vcpu: 2,
884        memory_bytes: 8 * GI,
885        ephemeral_storage_bytes: 20 * GI,
886        gpu: None,
887    },
888    InstanceTypeSpec {
889        name: "Standard_D4s_v5",
890        platform: Platform::Azure,
891        family: InstanceFamily::GeneralPurpose,
892        architecture: Architecture::X86_64,
893        vcpu: 4,
894        memory_bytes: 16 * GI,
895        ephemeral_storage_bytes: 20 * GI,
896        gpu: None,
897    },
898    InstanceTypeSpec {
899        name: "Standard_D8s_v5",
900        platform: Platform::Azure,
901        family: InstanceFamily::GeneralPurpose,
902        architecture: Architecture::X86_64,
903        vcpu: 8,
904        memory_bytes: 32 * GI,
905        ephemeral_storage_bytes: 20 * GI,
906        gpu: None,
907    },
908    InstanceTypeSpec {
909        name: "Standard_D16s_v5",
910        platform: Platform::Azure,
911        family: InstanceFamily::GeneralPurpose,
912        architecture: Architecture::X86_64,
913        vcpu: 16,
914        memory_bytes: 64 * GI,
915        ephemeral_storage_bytes: 20 * GI,
916        gpu: None,
917    },
918    // Compute Optimized (Fv2-series, up to 16 vCPU)
919    InstanceTypeSpec {
920        name: "Standard_F2s_v2",
921        platform: Platform::Azure,
922        family: InstanceFamily::ComputeOptimized,
923        architecture: Architecture::X86_64,
924        vcpu: 2,
925        memory_bytes: 4 * GI,
926        ephemeral_storage_bytes: 20 * GI,
927        gpu: None,
928    },
929    InstanceTypeSpec {
930        name: "Standard_F4s_v2",
931        platform: Platform::Azure,
932        family: InstanceFamily::ComputeOptimized,
933        architecture: Architecture::X86_64,
934        vcpu: 4,
935        memory_bytes: 8 * GI,
936        ephemeral_storage_bytes: 20 * GI,
937        gpu: None,
938    },
939    InstanceTypeSpec {
940        name: "Standard_F8s_v2",
941        platform: Platform::Azure,
942        family: InstanceFamily::ComputeOptimized,
943        architecture: Architecture::X86_64,
944        vcpu: 8,
945        memory_bytes: 16 * GI,
946        ephemeral_storage_bytes: 20 * GI,
947        gpu: None,
948    },
949    InstanceTypeSpec {
950        name: "Standard_F16s_v2",
951        platform: Platform::Azure,
952        family: InstanceFamily::ComputeOptimized,
953        architecture: Architecture::X86_64,
954        vcpu: 16,
955        memory_bytes: 32 * GI,
956        ephemeral_storage_bytes: 20 * GI,
957        gpu: None,
958    },
959    // Memory Optimized (Ev5-series, up to 16 vCPU)
960    InstanceTypeSpec {
961        name: "Standard_E2s_v5",
962        platform: Platform::Azure,
963        family: InstanceFamily::MemoryOptimized,
964        architecture: Architecture::X86_64,
965        vcpu: 2,
966        memory_bytes: 16 * GI,
967        ephemeral_storage_bytes: 20 * GI,
968        gpu: None,
969    },
970    InstanceTypeSpec {
971        name: "Standard_E4s_v5",
972        platform: Platform::Azure,
973        family: InstanceFamily::MemoryOptimized,
974        architecture: Architecture::X86_64,
975        vcpu: 4,
976        memory_bytes: 32 * GI,
977        ephemeral_storage_bytes: 20 * GI,
978        gpu: None,
979    },
980    InstanceTypeSpec {
981        name: "Standard_E8s_v5",
982        platform: Platform::Azure,
983        family: InstanceFamily::MemoryOptimized,
984        architecture: Architecture::X86_64,
985        vcpu: 8,
986        memory_bytes: 64 * GI,
987        ephemeral_storage_bytes: 20 * GI,
988        gpu: None,
989    },
990    InstanceTypeSpec {
991        name: "Standard_E16s_v5",
992        platform: Platform::Azure,
993        family: InstanceFamily::MemoryOptimized,
994        architecture: Architecture::X86_64,
995        vcpu: 16,
996        memory_bytes: 128 * GI,
997        ephemeral_storage_bytes: 20 * GI,
998        gpu: None,
999    },
1000    // Storage Optimized (Lsv3-series with NVMe)
1001    InstanceTypeSpec {
1002        name: "Standard_L8s_v3",
1003        platform: Platform::Azure,
1004        family: InstanceFamily::StorageOptimized,
1005        architecture: Architecture::X86_64,
1006        vcpu: 8,
1007        memory_bytes: 64 * GI,
1008        ephemeral_storage_bytes: 1788 * GI,
1009        gpu: None,
1010    },
1011    InstanceTypeSpec {
1012        name: "Standard_L16s_v3",
1013        platform: Platform::Azure,
1014        family: InstanceFamily::StorageOptimized,
1015        architecture: Architecture::X86_64,
1016        vcpu: 16,
1017        memory_bytes: 128 * GI,
1018        ephemeral_storage_bytes: 3576 * GI,
1019        gpu: None,
1020    },
1021    InstanceTypeSpec {
1022        name: "Standard_L32s_v3",
1023        platform: Platform::Azure,
1024        family: InstanceFamily::StorageOptimized,
1025        architecture: Architecture::X86_64,
1026        vcpu: 32,
1027        memory_bytes: 256 * GI,
1028        ephemeral_storage_bytes: 7154 * GI,
1029        gpu: None,
1030    },
1031    // GPU — NVIDIA T4 (NCasT4_v3-series)
1032    InstanceTypeSpec {
1033        name: "Standard_NC4as_T4_v3",
1034        platform: Platform::Azure,
1035        family: InstanceFamily::GpuCompute,
1036        architecture: Architecture::X86_64,
1037        vcpu: 4,
1038        memory_bytes: 28 * GI,
1039        ephemeral_storage_bytes: 176 * GI,
1040        gpu: Some(CatalogGpu {
1041            gpu_type: "nvidia-t4",
1042            count: 1,
1043        }),
1044    },
1045    // GPU — NVIDIA A100 (NC A100 v4-series)
1046    InstanceTypeSpec {
1047        name: "Standard_NC24ads_A100_v4",
1048        platform: Platform::Azure,
1049        family: InstanceFamily::GpuCompute,
1050        architecture: Architecture::X86_64,
1051        vcpu: 24,
1052        memory_bytes: 220 * GI,
1053        ephemeral_storage_bytes: 958 * GI,
1054        gpu: Some(CatalogGpu {
1055            gpu_type: "nvidia-a100",
1056            count: 1,
1057        }),
1058    },
1059    InstanceTypeSpec {
1060        name: "Standard_NC96ads_A100_v4",
1061        platform: Platform::Azure,
1062        family: InstanceFamily::GpuCompute,
1063        architecture: Architecture::X86_64,
1064        vcpu: 96,
1065        memory_bytes: 880 * GI,
1066        ephemeral_storage_bytes: 3916 * GI,
1067        gpu: Some(CatalogGpu {
1068            gpu_type: "nvidia-a100",
1069            count: 4,
1070        }),
1071    },
1072    // GPU — NVIDIA H100 (ND H100 v5-series)
1073    InstanceTypeSpec {
1074        name: "Standard_ND96isr_H100_v5",
1075        platform: Platform::Azure,
1076        family: InstanceFamily::GpuCompute,
1077        architecture: Architecture::X86_64,
1078        vcpu: 96,
1079        memory_bytes: 1900 * GI,
1080        ephemeral_storage_bytes: 1000 * GI,
1081        gpu: Some(CatalogGpu {
1082            gpu_type: "nvidia-h100",
1083            count: 8,
1084        }),
1085    },
1086];
1087
1088// ---------------------------------------------------------------------------
1089// Catalog lookup
1090// ---------------------------------------------------------------------------
1091
1092/// Get all instance types for a given platform.
1093pub fn catalog_for_platform(platform: Platform) -> Vec<&'static InstanceTypeSpec> {
1094    CATALOG
1095        .iter()
1096        .filter(|spec| spec.platform == platform)
1097        .collect()
1098}
1099
1100/// Find a specific instance type by name and platform.
1101pub fn find_instance_type(platform: Platform, name: &str) -> Option<&'static InstanceTypeSpec> {
1102    CATALOG
1103        .iter()
1104        .find(|spec| spec.platform == platform && spec.name == name)
1105}
1106
1107// ---------------------------------------------------------------------------
1108// Instance type selection
1109// ---------------------------------------------------------------------------
1110
/// Aggregated resource requirements from all containers in a capacity group.
///
/// This is the sole workload input to [`select_instance_type`] and
/// [`select_family`]: the "desired" totals drive per-machine sizing and the
/// minimum machine count, the "max" totals drive family classification and
/// the maximum machine count.
#[derive(Debug, Clone)]
pub struct WorkloadRequirements {
    /// Total CPU needed at desired scale (sum of desired CPU * desired_replicas per container).
    /// Expressed in cores (fractions allowed).
    pub total_cpu_at_desired: f64,
    /// Total memory needed at desired scale (sum of desired memory * desired_replicas per container).
    /// Expressed in bytes.
    pub total_memory_bytes_at_desired: u64,
    /// Total CPU needed at maximum scale (sum of desired CPU * max_replicas per container).
    /// Workloads below 2.0 here are classified as burstable by `select_family`.
    pub total_cpu_at_max: f64,
    /// Total memory needed at maximum scale (sum of desired memory * max_replicas per container).
    /// Expressed in bytes.
    pub total_memory_bytes_at_max: u64,
    /// Largest CPU request among all individual containers (single replica)
    pub max_cpu_per_container: f64,
    /// Largest memory request among all individual containers (single replica), in bytes
    pub max_memory_per_container: u64,
    /// Maximum ephemeral storage any single container requires, in bytes.
    /// Values above the storage-optimized threshold switch the workload to
    /// storage-optimized instances.
    pub max_ephemeral_storage_bytes: u64,
    /// GPU requirement (if any container needs GPU). When set, the workload is
    /// always placed on GPU instances matching this type and count.
    pub gpu: Option<GpuSpec>,
    /// Required CPU architecture, when source explicitly constrains it.
    /// `None` leaves every architecture in the catalog eligible.
    pub architecture: Option<Architecture>,
    /// If true, only instance types that expose nested virtualization (VT-x/EPT)
    /// to guest VMs are eligible. Required by workloads that run QEMU/KVM
    /// inside a container. If false, nested-virt-capable entries are excluded.
    pub nested_virt: bool,
}
1137
/// Result of instance type selection.
///
/// Produced by [`select_instance_type`]; pairs the chosen catalog entry with
/// the machine-count range computed for the workload.
#[derive(Debug, Clone)]
pub struct InstanceSelection {
    /// Selected instance type name (e.g., "m7g.2xlarge"), taken verbatim from the catalog entry
    pub instance_type: &'static str,
    /// Machine profile derived from the instance type
    pub profile: MachineProfile,
    /// Recommended minimum number of machines (between 1 and 2, never above `max_machines`)
    pub min_machines: u32,
    /// Recommended maximum number of machines (at least 1, clamped to the per-cluster limit)
    pub max_machines: u32,
}
1150
/// Ephemeral storage threshold above which storage-optimized instances are selected.
/// The comparison in `select_family` is strict: exactly 200 GiB stays on a standard family.
const STORAGE_OPTIMIZED_THRESHOLD: u64 = 200 * GI;

/// Maximum number of machines per cluster.
/// Upper clamp applied to the result of `compute_max_machines`.
const MAX_MACHINES_PER_CLUSTER: u32 = 10;

/// Hard cap on vCPUs for non-GPU/non-storage workloads. Equivalent to AWS 2xlarge.
/// Beyond this, horizontal scaling is always preferred over bigger machines.
const MAX_STANDARD_VCPU: u32 = 8;

/// How many of the largest standard container we want to fit per machine.
/// Multiplies the largest per-container CPU/memory request when sizing a machine.
const STANDARD_CONTAINERS_PER_MACHINE: f64 = 2.0;

/// Overhead factor for system processes and bin-packing inefficiency.
/// Applied on top of the per-container sizing target (not the workload totals).
const OVERHEAD_FACTOR: f64 = 1.25;

/// Runtime CPU reserved for system processes on each managed container machine.
/// Subtracted from an instance's vCPU count to obtain its allocatable CPU.
const SYSTEM_RESERVE_CPU: f64 = 0.5;

/// Runtime planning headroom for total desired/max workload.
/// Multiplies workload totals before they are divided across machines.
const WORKLOAD_HEADROOM_FACTOR: f64 = 1.15;
1172
1173/// Select the best instance type for a workload on a given platform.
1174///
1175/// The algorithm:
1176/// 1. GPU workloads: Match by GPU type, find smallest instance with enough GPUs.
1177/// 2. Storage-heavy workloads (>200Gi ephemeral): Use storage-optimized instances.
1178/// 3. All other workloads: Size the machine to fit a small HA-friendly baseline,
1179///    capped at 8 vCPUs. Use GeneralPurpose family for broad availability and
1180///    reasonable cost. Scale horizontally for more capacity.
1181///
1182/// Returns an error if no suitable instance type is found.
1183pub fn select_instance_type(
1184    platform: Platform,
1185    requirements: &WorkloadRequirements,
1186) -> Result<InstanceSelection, String> {
1187    // Determine which family to use. Nested virt isn't available on
1188    // burstable hardware on any cloud, so a workload that classifies as
1189    // Burstable but needs nested virt must be upgraded to GeneralPurpose
1190    // (the family that actually has nested-virt-capable entries).
1191    let raw_family = select_family(requirements);
1192    let family = if requirements.nested_virt && raw_family == InstanceFamily::Burstable {
1193        InstanceFamily::GeneralPurpose
1194    } else {
1195        raw_family
1196    };
1197
1198    // Filter catalog to matching platform + family. Nested-virt-capable
1199    // entries (m8i/c8i/r8i on AWS) are NESTED-VIRT-ONLY: they're added
1200    // exclusively to satisfy `CpuOptions.NestedVirtualization=enabled`
1201    // workloads and cost more than the Graviton default. We include them
1202    // ONLY when `requirements.nested_virt` is true, and exclude them
1203    // otherwise so the cost-efficient default (e.g. m7g) keeps winning
1204    // for ordinary workloads.
1205    let candidates: Vec<&InstanceTypeSpec> = CATALOG
1206        .iter()
1207        .filter(|spec| spec.platform == platform && spec.family == family)
1208        .filter(|spec| match requirements.architecture {
1209            Some(architecture) => spec.architecture == architecture,
1210            None => true,
1211        })
1212        .filter(|spec| {
1213            if requirements.nested_virt {
1214                spec.is_nested_virt_capable()
1215            } else {
1216                !spec.is_nested_virt_capable()
1217            }
1218        })
1219        .collect();
1220
1221    if candidates.is_empty() {
1222        return Err(if requirements.nested_virt {
1223            format!(
1224                "no nested-virt-capable {family:?} instance types in catalog for platform {platform}; \
1225                 only 8th-gen Intel families (m8i/c8i/r8i) support nested virtualization on AWS"
1226            )
1227        } else {
1228            format!("no {family:?} instance types in catalog for platform {platform}")
1229        });
1230    }
1231
1232    // For GPU workloads, filter by GPU type
1233    let candidates = if let Some(ref gpu) = requirements.gpu {
1234        let filtered: Vec<&InstanceTypeSpec> = candidates
1235            .into_iter()
1236            .filter(|spec| {
1237                spec.gpu.as_ref().map_or(false, |g| {
1238                    g.gpu_type == gpu.gpu_type && g.count >= gpu.count
1239                })
1240            })
1241            .collect();
1242        if filtered.is_empty() {
1243            return Err(format!(
1244                "no instance type for GPU type '{}' x{} on platform {platform}",
1245                gpu.gpu_type, gpu.count
1246            ));
1247        }
1248        filtered
1249    } else {
1250        candidates
1251    };
1252
1253    // For storage workloads, filter by ephemeral storage capacity
1254    let candidates = if family == InstanceFamily::StorageOptimized {
1255        let filtered: Vec<&InstanceTypeSpec> = candidates
1256            .into_iter()
1257            .filter(|spec| spec.ephemeral_storage_bytes >= requirements.max_ephemeral_storage_bytes)
1258            .collect();
1259        if filtered.is_empty() {
1260            return Err(format!(
1261                "no storage-optimized instance with >= {} bytes ephemeral storage on platform {platform}",
1262                requirements.max_ephemeral_storage_bytes
1263            ));
1264        }
1265        filtered
1266    } else {
1267        candidates
1268    };
1269
1270    // Cap at MAX_STANDARD_VCPU for non-GPU/non-storage workloads
1271    let vcpu_cap =
1272        if family == InstanceFamily::GpuCompute || family == InstanceFamily::StorageOptimized {
1273            u32::MAX
1274        } else {
1275            MAX_STANDARD_VCPU
1276        };
1277
1278    let desired_target_machines = desired_target_machines(requirements);
1279    let target_cpu =
1280        (requirements.max_cpu_per_container * STANDARD_CONTAINERS_PER_MACHINE * OVERHEAD_FACTOR)
1281            .max(
1282                requirements.total_cpu_at_desired * WORKLOAD_HEADROOM_FACTOR
1283                    / desired_target_machines as f64,
1284            )
1285            .max(0.25);
1286    let target_memory = (requirements.max_memory_per_container as f64
1287        * STANDARD_CONTAINERS_PER_MACHINE
1288        * OVERHEAD_FACTOR)
1289        .max(
1290            requirements.total_memory_bytes_at_desired as f64 * WORKLOAD_HEADROOM_FACTOR
1291                / desired_target_machines as f64,
1292        )
1293        .max(256.0 * MI as f64);
1294
1295    // Find the smallest instance that meets per-container targets within the cap.
1296    let selected = candidates
1297        .iter()
1298        .filter(|spec| {
1299            spec.vcpu <= vcpu_cap
1300                && spec.vcpu as f64 >= target_cpu
1301                && spec.memory_bytes as f64 >= target_memory
1302        })
1303        .min_by_key(|spec| spec.vcpu)
1304        .or_else(|| {
1305            // If nothing fits within the cap, pick the largest instance under the cap
1306            candidates
1307                .iter()
1308                .filter(|spec| spec.vcpu <= vcpu_cap)
1309                .max_by_key(|spec| spec.vcpu)
1310        })
1311        .or_else(|| {
1312            // Last resort: pick the smallest available instance (for GPU/storage)
1313            candidates.iter().min_by_key(|spec| spec.vcpu)
1314        })
1315        .ok_or_else(|| format!("no instance types available for platform {platform}"))?;
1316
1317    // Calculate machine counts
1318    let max_machines = compute_max_machines(requirements, selected);
1319    let min_machines = compute_min_machines(requirements, selected, max_machines);
1320
1321    Ok(InstanceSelection {
1322        instance_type: selected.name,
1323        profile: selected.to_machine_profile(),
1324        min_machines,
1325        max_machines,
1326    })
1327}
1328
1329/// Select instance family based on workload characteristics.
1330///
1331/// Uses GeneralPurpose for all standard workloads — widely available across
1332/// regions and cost-effective. Only specialized workloads (GPU, large ephemeral
1333/// storage) get specialized families. Very small workloads get burstable.
1334pub fn select_family(requirements: &WorkloadRequirements) -> InstanceFamily {
1335    // GPU workloads always get GPU instances
1336    if requirements.gpu.is_some() {
1337        return InstanceFamily::GpuCompute;
1338    }
1339
1340    // Large ephemeral storage needs NVMe (storage-optimized)
1341    if requirements.max_ephemeral_storage_bytes > STORAGE_OPTIMIZED_THRESHOLD {
1342        return InstanceFamily::StorageOptimized;
1343    }
1344
1345    // Very small workloads use burstable instances
1346    if requirements.total_cpu_at_max < 2.0 {
1347        return InstanceFamily::Burstable;
1348    }
1349
1350    // All other workloads use GeneralPurpose — available everywhere, good pricing
1351    InstanceFamily::GeneralPurpose
1352}
1353
1354/// Calculate maximum machines needed to fit the workload with headroom.
1355fn compute_max_machines(requirements: &WorkloadRequirements, instance: &InstanceTypeSpec) -> u32 {
1356    let cpu_with_headroom = requirements.total_cpu_at_max * WORKLOAD_HEADROOM_FACTOR;
1357    let cpu_machines = (cpu_with_headroom / allocatable_cpu(instance)).ceil() as u32;
1358
1359    let mem_with_headroom =
1360        requirements.total_memory_bytes_at_max as f64 * WORKLOAD_HEADROOM_FACTOR;
1361    let mem_machines =
1362        (mem_with_headroom / allocatable_memory_bytes(instance) as f64).ceil() as u32;
1363
1364    // Take the larger of CPU-based and memory-based, clamped to cluster limit
1365    cpu_machines
1366        .max(mem_machines)
1367        .max(1)
1368        .min(MAX_MACHINES_PER_CLUSTER)
1369}
1370
1371/// Calculate minimum machines for HA.
1372fn compute_min_machines(
1373    requirements: &WorkloadRequirements,
1374    instance: &InstanceTypeSpec,
1375    max_machines: u32,
1376) -> u32 {
1377    let cpu_with_headroom = requirements.total_cpu_at_desired * WORKLOAD_HEADROOM_FACTOR;
1378    let cpu_machines = (cpu_with_headroom / allocatable_cpu(instance)).ceil() as u32;
1379
1380    let mem_with_headroom =
1381        requirements.total_memory_bytes_at_desired as f64 * WORKLOAD_HEADROOM_FACTOR;
1382    let mem_machines =
1383        (mem_with_headroom / allocatable_memory_bytes(instance) as f64).ceil() as u32;
1384
1385    cpu_machines
1386        .max(mem_machines)
1387        .max(1)
1388        .min(2)
1389        .min(max_machines)
1390}
1391
1392fn desired_target_machines(requirements: &WorkloadRequirements) -> u32 {
1393    if requirements.total_cpu_at_desired >= 2.0
1394        || requirements.total_memory_bytes_at_desired >= 4 * GI
1395    {
1396        2
1397    } else {
1398        1
1399    }
1400}
1401
1402fn allocatable_cpu(instance: &InstanceTypeSpec) -> f64 {
1403    (instance.vcpu as f64 - SYSTEM_RESERVE_CPU).max(0.25)
1404}
1405
1406fn allocatable_memory_bytes(instance: &InstanceTypeSpec) -> u64 {
1407    instance
1408        .memory_bytes
1409        .saturating_sub(system_reserve_memory_bytes(instance.memory_bytes))
1410        .max(256 * MI)
1411}
1412
1413fn system_reserve_memory_bytes(memory_bytes: u64) -> u64 {
1414    if memory_bytes < 4 * GI {
1415        256 * MI
1416    } else if memory_bytes < 16 * GI {
1417        512 * MI
1418    } else {
1419        GI
1420    }
1421}
1422
1423// ---------------------------------------------------------------------------
1424// Tests
1425// ---------------------------------------------------------------------------
1426
1427#[cfg(test)]
1428mod tests {
1429    use super::*;
1430
1431    // -- Parsing tests --
1432
1433    #[test]
1434    fn test_parse_cpu_plain() {
1435        assert_eq!(parse_cpu("1").unwrap(), 1.0);
1436        assert_eq!(parse_cpu("0.5").unwrap(), 0.5);
1437        assert_eq!(parse_cpu("2.0").unwrap(), 2.0);
1438        assert_eq!(parse_cpu("16").unwrap(), 16.0);
1439    }
1440
1441    #[test]
1442    fn test_parse_cpu_millicore() {
1443        assert_eq!(parse_cpu("500m").unwrap(), 0.5);
1444        assert_eq!(parse_cpu("250m").unwrap(), 0.25);
1445        assert_eq!(parse_cpu("1000m").unwrap(), 1.0);
1446        assert_eq!(parse_cpu("100m").unwrap(), 0.1);
1447    }
1448
1449    #[test]
1450    fn test_parse_cpu_invalid() {
1451        assert!(parse_cpu("").is_err());
1452        assert!(parse_cpu("abc").is_err());
1453        assert!(parse_cpu("m").is_err());
1454    }
1455
1456    #[test]
1457    fn test_parse_memory_binary_suffixes() {
1458        assert_eq!(parse_memory_bytes("1Ki").unwrap(), 1024);
1459        assert_eq!(parse_memory_bytes("1Mi").unwrap(), 1024 * 1024);
1460        assert_eq!(parse_memory_bytes("1Gi").unwrap(), 1024 * 1024 * 1024);
1461        assert_eq!(parse_memory_bytes("4Gi").unwrap(), 4 * 1024 * 1024 * 1024);
1462        assert_eq!(parse_memory_bytes("512Mi").unwrap(), 512 * 1024 * 1024);
1463        assert_eq!(
1464            parse_memory_bytes("1Ti").unwrap(),
1465            1024u64 * 1024 * 1024 * 1024
1466        );
1467    }
1468
1469    #[test]
1470    fn test_parse_memory_decimal_suffixes() {
1471        assert_eq!(parse_memory_bytes("1k").unwrap(), 1000);
1472        assert_eq!(parse_memory_bytes("1M").unwrap(), 1_000_000);
1473        assert_eq!(parse_memory_bytes("1G").unwrap(), 1_000_000_000);
1474        assert_eq!(parse_memory_bytes("1T").unwrap(), 1_000_000_000_000);
1475    }
1476
1477    #[test]
1478    fn test_parse_memory_plain_bytes() {
1479        assert_eq!(parse_memory_bytes("1024").unwrap(), 1024);
1480        assert_eq!(parse_memory_bytes("0").unwrap(), 0);
1481    }
1482
1483    #[test]
1484    fn test_parse_memory_invalid() {
1485        assert!(parse_memory_bytes("").is_err());
1486        assert!(parse_memory_bytes("abc").is_err());
1487        assert!(parse_memory_bytes("Gi").is_err());
1488    }
1489
1490    #[test]
1491    fn test_parse_memory_fractional() {
1492        assert_eq!(parse_memory_bytes("0.5Gi").unwrap(), GI / 2);
1493        assert_eq!(parse_memory_bytes("1.5Gi").unwrap(), GI + GI / 2);
1494    }
1495
1496    // -- Catalog lookup tests --
1497
1498    #[test]
1499    fn test_catalog_has_entries_for_all_cloud_platforms() {
1500        assert!(!catalog_for_platform(Platform::Aws).is_empty());
1501        assert!(!catalog_for_platform(Platform::Gcp).is_empty());
1502        assert!(!catalog_for_platform(Platform::Azure).is_empty());
1503    }
1504
1505    #[test]
1506    fn test_catalog_no_entries_for_non_cloud_platforms() {
1507        assert!(catalog_for_platform(Platform::Local).is_empty());
1508        assert!(catalog_for_platform(Platform::Kubernetes).is_empty());
1509    }
1510
1511    #[test]
1512    fn test_find_known_instance_type() {
1513        let spec =
1514            find_instance_type(Platform::Aws, "m7g.2xlarge").expect("should find m7g.2xlarge");
1515        assert_eq!(spec.vcpu, 8);
1516        assert_eq!(spec.memory_bytes, 32 * GI);
1517        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
1518    }
1519
1520    #[test]
1521    fn test_find_aws_c8i_nested_virt_instance_type() {
1522        let spec = find_instance_type(Platform::Aws, "c8i.large").expect("should find c8i.large");
1523        assert_eq!(spec.vcpu, 2);
1524        assert_eq!(spec.memory_bytes, 4 * GI);
1525        assert_eq!(spec.family, InstanceFamily::ComputeOptimized);
1526        assert_eq!(spec.architecture, Architecture::X86_64);
1527        assert!(spec.is_nested_virt_capable());
1528    }
1529
1530    #[test]
1531    fn test_find_unknown_instance_type() {
1532        assert!(find_instance_type(Platform::Aws, "nonexistent.xlarge").is_none());
1533    }
1534
1535    #[test]
1536    fn test_find_wrong_platform() {
1537        assert!(find_instance_type(Platform::Gcp, "m7g.2xlarge").is_none());
1538    }
1539
1540    #[test]
1541    fn test_to_machine_profile() {
1542        let spec = find_instance_type(Platform::Aws, "m7g.2xlarge").unwrap();
1543        let profile = spec.to_machine_profile();
1544        assert_eq!(profile.cpu, "8.0");
1545        assert_eq!(profile.memory_bytes, 32 * GI);
1546        assert_eq!(profile.ephemeral_storage_bytes, 20 * GI);
1547        assert!(profile.gpu.is_none());
1548    }
1549
1550    #[test]
1551    fn test_to_machine_profile_with_gpu() {
1552        let spec = find_instance_type(Platform::Aws, "p4d.24xlarge").unwrap();
1553        let profile = spec.to_machine_profile();
1554        let gpu = profile.gpu.as_ref().expect("should have GPU");
1555        assert_eq!(gpu.gpu_type, "nvidia-a100");
1556        assert_eq!(gpu.count, 8);
1557    }
1558
1559    // -- Selection algorithm tests --
1560
1561    #[test]
1562    fn test_select_burstable_for_small_workload() {
1563        let req = WorkloadRequirements {
1564            total_cpu_at_desired: 1.0,
1565            total_memory_bytes_at_desired: 2 * GI,
1566            total_cpu_at_max: 1.0,
1567            total_memory_bytes_at_max: 2 * GI,
1568            max_cpu_per_container: 0.5,
1569            max_memory_per_container: 1 * GI,
1570            max_ephemeral_storage_bytes: 10 * GI,
1571            gpu: None,
1572            architecture: None,
1573            nested_virt: false,
1574        };
1575        let sel = select_instance_type(Platform::Aws, &req).unwrap();
1576        let spec = find_instance_type(Platform::Aws, sel.instance_type).unwrap();
1577        assert_eq!(spec.family, InstanceFamily::Burstable);
1578    }
1579
1580    #[test]
1581    fn test_select_general_purpose_for_standard_workload() {
1582        // Standard workloads always get GeneralPurpose regardless of CPU:memory ratio
1583        let req = WorkloadRequirements {
1584            total_cpu_at_desired: 20.0,
1585            total_memory_bytes_at_desired: 80 * GI,
1586            total_cpu_at_max: 20.0,
1587            total_memory_bytes_at_max: 80 * GI,
1588            max_cpu_per_container: 2.0,
1589            max_memory_per_container: 8 * GI,
1590            max_ephemeral_storage_bytes: 10 * GI,
1591            gpu: None,
1592            architecture: None,
1593            nested_virt: false,
1594        };
1595        let sel = select_instance_type(Platform::Aws, &req).unwrap();
1596        let spec = find_instance_type(Platform::Aws, sel.instance_type).unwrap();
1597        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
1598    }
1599
1600    #[test]
1601    fn test_select_general_purpose_even_for_cpu_heavy() {
1602        // CPU-heavy workloads still get GeneralPurpose (no more ComputeOptimized auto-select)
1603        let req = WorkloadRequirements {
1604            total_cpu_at_desired: 20.0,
1605            total_memory_bytes_at_desired: 20 * GI,
1606            total_cpu_at_max: 20.0,
1607            total_memory_bytes_at_max: 20 * GI,
1608            max_cpu_per_container: 2.0,
1609            max_memory_per_container: 2 * GI,
1610            max_ephemeral_storage_bytes: 10 * GI,
1611            gpu: None,
1612            architecture: None,
1613            nested_virt: false,
1614        };
1615        let sel = select_instance_type(Platform::Aws, &req).unwrap();
1616        let spec = find_instance_type(Platform::Aws, sel.instance_type).unwrap();
1617        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
1618    }
1619
1620    #[test]
1621    fn test_select_storage_optimized_for_large_ephemeral() {
1622        let req = WorkloadRequirements {
1623            total_cpu_at_desired: 8.0,
1624            total_memory_bytes_at_desired: 32 * GI,
1625            total_cpu_at_max: 8.0,
1626            total_memory_bytes_at_max: 32 * GI,
1627            max_cpu_per_container: 2.0,
1628            max_memory_per_container: 8 * GI,
1629            max_ephemeral_storage_bytes: 500 * GI,
1630            gpu: None,
1631            architecture: None,
1632            nested_virt: false,
1633        };
1634        let sel = select_instance_type(Platform::Aws, &req).unwrap();
1635        let spec = find_instance_type(Platform::Aws, sel.instance_type).unwrap();
1636        assert_eq!(spec.family, InstanceFamily::StorageOptimized);
1637    }
1638
1639    #[test]
1640    fn test_select_gpu_instance() {
1641        let req = WorkloadRequirements {
1642            total_cpu_at_desired: 8.0,
1643            total_memory_bytes_at_desired: 32 * GI,
1644            total_cpu_at_max: 8.0,
1645            total_memory_bytes_at_max: 32 * GI,
1646            max_cpu_per_container: 4.0,
1647            max_memory_per_container: 16 * GI,
1648            max_ephemeral_storage_bytes: 10 * GI,
1649            gpu: Some(GpuSpec {
1650                gpu_type: "nvidia-a100".to_string(),
1651                count: 1,
1652            }),
1653            architecture: None,
1654            nested_virt: false,
1655        };
1656        let sel = select_instance_type(Platform::Aws, &req).unwrap();
1657        let spec = find_instance_type(Platform::Aws, sel.instance_type).unwrap();
1658        assert_eq!(spec.family, InstanceFamily::GpuCompute);
1659        assert!(spec.gpu.is_some());
1660    }
1661
1662    #[test]
1663    fn test_select_works_for_all_cloud_platforms() {
1664        let req = WorkloadRequirements {
1665            total_cpu_at_desired: 4.0,
1666            total_memory_bytes_at_desired: 16 * GI,
1667            total_cpu_at_max: 4.0,
1668            total_memory_bytes_at_max: 16 * GI,
1669            max_cpu_per_container: 1.0,
1670            max_memory_per_container: 4 * GI,
1671            max_ephemeral_storage_bytes: 10 * GI,
1672            gpu: None,
1673            architecture: None,
1674            nested_virt: false,
1675        };
1676        for platform in [Platform::Aws, Platform::Gcp, Platform::Azure] {
1677            let sel = select_instance_type(platform, &req);
1678            assert!(sel.is_ok(), "selection failed for {platform}");
1679        }
1680    }
1681
1682    #[test]
1683    fn test_machine_count_reasonable() {
1684        // Single container: 1 CPU, 2Gi, maxReplicas=20
1685        let req = WorkloadRequirements {
1686            total_cpu_at_desired: 20.0,
1687            total_memory_bytes_at_desired: 40 * GI,
1688            total_cpu_at_max: 20.0,
1689            total_memory_bytes_at_max: 40 * GI,
1690            max_cpu_per_container: 1.0,
1691            max_memory_per_container: 2 * GI,
1692            max_ephemeral_storage_bytes: 10 * GI,
1693            gpu: None,
1694            architecture: None,
1695            nested_virt: false,
1696        };
1697        let sel = select_instance_type(Platform::Aws, &req).unwrap();
1698        assert!(sel.min_machines >= 1);
1699        assert!(sel.max_machines <= MAX_MACHINES_PER_CLUSTER);
1700        assert!(sel.max_machines >= sel.min_machines);
1701    }
1702
1703    #[test]
1704    fn test_instance_size_capped_at_8_vcpu() {
1705        // Even with very large containers, instance size is capped at 8 vCPUs
1706        let req = WorkloadRequirements {
1707            total_cpu_at_desired: 70.0,
1708            total_memory_bytes_at_desired: 140 * GI,
1709            total_cpu_at_max: 70.0,
1710            total_memory_bytes_at_max: 140 * GI,
1711            max_cpu_per_container: 2.0,
1712            max_memory_per_container: 4 * GI,
1713            max_ephemeral_storage_bytes: 10 * GI,
1714            gpu: None,
1715            architecture: None,
1716            nested_virt: false,
1717        };
1718        let sel = select_instance_type(Platform::Gcp, &req).unwrap();
1719        let spec = find_instance_type(Platform::Gcp, sel.instance_type).unwrap();
1720        assert!(
1721            spec.vcpu <= MAX_STANDARD_VCPU,
1722            "selected {} with {} vCPUs, expected <= {}",
1723            spec.name,
1724            spec.vcpu,
1725            MAX_STANDARD_VCPU
1726        );
1727        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
1728        // Should scale horizontally instead
1729        assert!(sel.max_machines > 1);
1730    }
1731
1732    #[test]
1733    fn test_larger_autoscaled_workload_gets_reasonable_instance() {
1734        // Simulates a larger autoscaled workload: 4 containers, each 2 CPU / 4 GiB
1735        // maxReplicas: 10, 10, 10, 5
1736        let req = WorkloadRequirements {
1737            total_cpu_at_desired: 70.0,
1738            total_memory_bytes_at_desired: 140 * GI,
1739            total_cpu_at_max: 70.0,              // 2*10 + 2*10 + 2*10 + 2*5
1740            total_memory_bytes_at_max: 140 * GI, // 4*10 + 4*10 + 4*10 + 4*5
1741            max_cpu_per_container: 2.0,
1742            max_memory_per_container: 4 * GI,
1743            max_ephemeral_storage_bytes: 20 * GI,
1744            gpu: None,
1745            architecture: None,
1746            nested_virt: false,
1747        };
1748        let sel = select_instance_type(Platform::Gcp, &req).unwrap();
1749        // Should pick n2-standard-8 (8 vCPU, 32 GiB) — NOT c3-standard-44
1750        assert_eq!(sel.instance_type, "n2-standard-8");
1751        assert!(sel.max_machines >= 2);
1752    }
1753
1754    /// When `nested_virt` is set on the workload, the selector must
1755    /// restrict to nested-virt-capable families. On AWS that means an m8i
1756    /// (or other 8th-gen Intel) entry, never a Graviton (`*7g`, `t4g`) or
1757    /// burstable. Without this filter the launch template gets created
1758    /// with `CpuOptions.NestedVirtualization=enabled` paired with an
1759    /// instance type AWS rejects at RunInstances.
1760    #[test]
1761    fn test_select_aws_picks_m8i_when_nested_virt_required() {
1762        let req = WorkloadRequirements {
1763            total_cpu_at_desired: 4.0,
1764            total_memory_bytes_at_desired: 8 * GI,
1765            total_cpu_at_max: 4.0,
1766            total_memory_bytes_at_max: 8 * GI,
1767            max_cpu_per_container: 4.0,
1768            max_memory_per_container: 8 * GI,
1769            max_ephemeral_storage_bytes: 10 * GI,
1770            gpu: None,
1771            architecture: None,
1772            nested_virt: true,
1773        };
1774        let sel = select_instance_type(Platform::Aws, &req).unwrap();
1775        assert!(
1776            sel.instance_type.starts_with("m8i.")
1777                || sel.instance_type.starts_with("c8i.")
1778                || sel.instance_type.starts_with("r8i."),
1779            "expected an m8i/c8i/r8i instance, got {}",
1780            sel.instance_type
1781        );
1782        let spec = find_instance_type(Platform::Aws, sel.instance_type).unwrap();
1783        assert!(spec.is_nested_virt_capable());
1784    }
1785
1786    /// Negative case: with `nested_virt = false`, the selector should
1787    /// continue picking the cost-efficient Graviton default, unchanged
1788    /// from prior behavior.
1789    #[test]
1790    fn test_select_aws_falls_back_to_graviton_without_nested_virt() {
1791        let req = WorkloadRequirements {
1792            total_cpu_at_desired: 4.0,
1793            total_memory_bytes_at_desired: 8 * GI,
1794            total_cpu_at_max: 4.0,
1795            total_memory_bytes_at_max: 8 * GI,
1796            max_cpu_per_container: 4.0,
1797            max_memory_per_container: 8 * GI,
1798            max_ephemeral_storage_bytes: 10 * GI,
1799            gpu: None,
1800            architecture: None,
1801            nested_virt: false,
1802        };
1803        let sel = select_instance_type(Platform::Aws, &req).unwrap();
1804        assert!(
1805            sel.instance_type.starts_with("m7g.") || sel.instance_type.starts_with("t4g."),
1806            "expected Graviton default, got {}",
1807            sel.instance_type
1808        );
1809    }
1810
1811    #[test]
1812    fn test_profile_has_required_fields() {
1813        let req = WorkloadRequirements {
1814            total_cpu_at_desired: 4.0,
1815            total_memory_bytes_at_desired: 16 * GI,
1816            total_cpu_at_max: 4.0,
1817            total_memory_bytes_at_max: 16 * GI,
1818            max_cpu_per_container: 1.0,
1819            max_memory_per_container: 4 * GI,
1820            max_ephemeral_storage_bytes: 10 * GI,
1821            gpu: None,
1822            architecture: None,
1823            nested_virt: false,
1824        };
1825        let sel = select_instance_type(Platform::Aws, &req).unwrap();
1826        assert!(!sel.profile.cpu.is_empty());
1827        assert!(sel.profile.memory_bytes > 0);
1828        assert!(sel.profile.ephemeral_storage_bytes > 0);
1829    }
1830
1831    #[test]
1832    fn test_error_for_unsupported_gpu_type() {
1833        let req = WorkloadRequirements {
1834            total_cpu_at_desired: 8.0,
1835            total_memory_bytes_at_desired: 32 * GI,
1836            total_cpu_at_max: 8.0,
1837            total_memory_bytes_at_max: 32 * GI,
1838            max_cpu_per_container: 4.0,
1839            max_memory_per_container: 16 * GI,
1840            max_ephemeral_storage_bytes: 10 * GI,
1841            gpu: Some(GpuSpec {
1842                gpu_type: "amd-mi300".to_string(),
1843                count: 1,
1844            }),
1845            architecture: None,
1846            nested_virt: false,
1847        };
1848        let result = select_instance_type(Platform::Aws, &req);
1849        assert!(result.is_err());
1850    }
1851
1852    #[test]
1853    fn test_catalog_instance_types_sorted_by_vcpu_within_family() {
1854        // Verify that within each (platform, family) group, vcpu is non-decreasing.
1855        // This ensures our "min_by_key(vcpu)" logic works correctly.
1856        for platform in [Platform::Aws, Platform::Gcp, Platform::Azure] {
1857            let entries = catalog_for_platform(platform);
1858            let mut by_family: std::collections::HashMap<_, Vec<_>> =
1859                std::collections::HashMap::new();
1860            for entry in entries {
1861                by_family
1862                    .entry(format!("{:?}", entry.family))
1863                    .or_default()
1864                    .push(entry);
1865            }
1866            for (family, instances) in &by_family {
1867                for window in instances.windows(2) {
1868                    assert!(
1869                        window[0].vcpu <= window[1].vcpu,
1870                        "catalog not sorted by vcpu for {platform}/{family}: {} ({}) > {} ({})",
1871                        window[0].name,
1872                        window[0].vcpu,
1873                        window[1].name,
1874                        window[1].vcpu
1875                    );
1876                }
1877            }
1878        }
1879    }
1880}