alien_core/
instance_catalog.rs

1//! Instance type catalog and selection algorithm for cloud compute infrastructure.
2//!
3//! This module provides:
4//! - A static catalog of known instance types across AWS, GCP, and Azure
5//! - Resource quantity parsing (CPU strings, Kubernetes-style memory/storage quantities)
6//! - An algorithm to select the optimal instance type for a given workload
7//!
8//! The catalog is the single source of truth for instance type specifications.
9//! It is used by the preflights system to automatically populate `CapacityGroup.instance_type`
10//! and `CapacityGroup.profile` based on the containers in a stack.
11
12use crate::{GpuSpec, MachineProfile, Platform};
13
14// ---------------------------------------------------------------------------
15// Resource quantity parsing
16// ---------------------------------------------------------------------------
17
/// Parse a CPU quantity string into a number of cores.
///
/// Accepts plain decimal numbers (`"1"`, `"0.5"`, `"2.0"`) and the millicore
/// suffix (`"500m"` = 0.5 cores). Leading and trailing whitespace is ignored.
///
/// # Errors
///
/// Returns a descriptive message when the input is empty, is not a number, or
/// is negative or non-finite. The last case is checked explicitly because
/// `f64::from_str` happily parses `"NaN"`, `"inf"` and `"-1"`, none of which
/// is a meaningful CPU quantity.
pub fn parse_cpu(s: &str) -> Result<f64, String> {
    let s = s.trim();
    if s.is_empty() {
        return Err("empty CPU string".to_string());
    }

    // A trailing `m` means the number is expressed in thousandths of a core.
    let (number, divisor, what) = match s.strip_suffix('m') {
        Some(millis) => (millis, 1000.0, "CPU millicore value"),
        None => (s, 1.0, "CPU value"),
    };
    let invalid = || format!("invalid {what}: '{s}'");

    let value: f64 = number.parse().map_err(|_| invalid())?;
    if !value.is_finite() || value < 0.0 {
        return Err(invalid());
    }
    Ok(value / divisor)
}
36
/// Parse a memory or storage quantity string to bytes.
///
/// Supports Kubernetes-style binary suffixes (`Ki`, `Mi`, `Gi`, `Ti`, powers
/// of 1024) and decimal suffixes (`k`, `M`, `G`, `T`, powers of 1000).
/// Suffixed quantities may be fractional (`"1.5Gi"`); the result is truncated
/// to whole bytes. A plain number without a suffix is interpreted as an
/// integer byte count. Leading and trailing whitespace is ignored.
///
/// # Errors
///
/// Returns a descriptive message when the input is empty, is not a number, or
/// when the scaled value is negative, non-finite, or too large for `u64`.
pub fn parse_memory_bytes(s: &str) -> Result<u64, String> {
    // Suffix -> multiplier. No suffix is a suffix of another one ("1Gi" does
    // not end in "G"), so the lookup order does not affect the result.
    const UNITS: [(&str, u64); 8] = [
        ("Ti", 1 << 40),
        ("Gi", 1 << 30),
        ("Mi", 1 << 20),
        ("Ki", 1 << 10),
        ("T", 1_000_000_000_000),
        ("G", 1_000_000_000),
        ("M", 1_000_000),
        ("k", 1_000),
    ];

    let s = s.trim();
    if s.is_empty() {
        return Err("empty memory/storage string".to_string());
    }
    let invalid = || format!("invalid memory value: '{s}'");

    let scaled = UNITS
        .iter()
        .find_map(|&(suffix, multiplier)| s.strip_suffix(suffix).map(|num| (num, multiplier)));

    match scaled {
        // Plain bytes: parsed directly as an integer, so no precision is lost.
        None => s.parse().map_err(|_| invalid()),
        Some((number, multiplier)) => {
            let value: f64 = number.parse().map_err(|_| invalid())?;
            let bytes = value * multiplier as f64;
            // A float-to-int `as` cast saturates instead of failing: NaN and
            // negatives become 0 and anything too large becomes `u64::MAX`.
            // Reject those inputs rather than returning a bogus size.
            if !bytes.is_finite() || bytes < 0.0 || bytes >= u64::MAX as f64 {
                return Err(invalid());
            }
            Ok(bytes as u64)
        }
    }
}
103
104// ---------------------------------------------------------------------------
105// Instance type catalog
106// ---------------------------------------------------------------------------
107
/// Broad workload category that a catalog instance type belongs to.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum InstanceFamily {
    /// Burstable types (catalog examples: AWS `t4g`, GCP `e2`, Azure B-series).
    Burstable,
    /// General-purpose types (catalog examples: AWS `m7g`, GCP `n2-standard`, Azure Dv5).
    GeneralPurpose,
    /// Compute-optimized types (catalog examples: AWS `c7g`, GCP `c3-standard`, Azure Fv2).
    ComputeOptimized,
    /// Memory-optimized types (catalog examples: AWS `r7g`, GCP `n2-highmem`, Azure Ev5).
    MemoryOptimized,
    /// Storage-optimized types with large local disks (catalog examples: AWS `i4i`, Azure Lsv3).
    StorageOptimized,
    /// GPU-equipped types; catalog entries in this family carry a `CatalogGpu`.
    GpuCompute,
}
118
/// Processor architecture of an instance type.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Architecture {
    /// 64-bit ARM (used by the AWS Graviton entries in the catalog).
    Arm64,
    /// 64-bit x86.
    X86_64,
}
125
/// GPU description used inside catalog entries.
///
/// Holds only a `&'static str` and an integer, so it is `Copy` and needs no
/// heap allocation.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CatalogGpu {
    /// GPU model identifier, e.g. `"nvidia-t4"`.
    pub gpu_type: &'static str,
    /// Number of GPUs of this model attached to the instance.
    pub count: u32,
}
132
133/// A known instance type with its hardware specifications.
134///
135/// All fields are compile-time constants. The catalog is a flat array of these.
136#[derive(Debug, Clone)]
137pub struct InstanceTypeSpec {
138    pub name: &'static str,
139    pub platform: Platform,
140    pub family: InstanceFamily,
141    pub architecture: Architecture,
142    /// vCPU count (hardware total)
143    pub vcpu: u32,
144    /// Memory in bytes (hardware total)
145    pub memory_bytes: u64,
146    /// Ephemeral storage in bytes (hardware total, NVMe for storage-optimized)
147    pub ephemeral_storage_bytes: u64,
148    /// GPU specification (for GPU instances)
149    pub gpu: Option<CatalogGpu>,
150}
151
152impl InstanceTypeSpec {
153    /// Convert this catalog entry into a `MachineProfile` for use in `CapacityGroup`.
154    pub fn to_machine_profile(&self) -> MachineProfile {
155        MachineProfile {
156            cpu: format!("{}.0", self.vcpu),
157            memory_bytes: self.memory_bytes,
158            ephemeral_storage_bytes: self.ephemeral_storage_bytes,
159            gpu: self.gpu.map(|g| GpuSpec {
160                gpu_type: g.gpu_type.to_string(),
161                count: g.count,
162            }),
163        }
164    }
165}
166
// Binary (power-of-two) byte multipliers that keep the catalog sizes readable.
const KI: u64 = 1 << 10;
const MI: u64 = 1 << 20;
const GI: u64 = 1 << 30;
171
172/// The complete instance type catalog.
173///
174/// This is the single source of truth for instance type specifications.
175/// Update this array when adding support for new instance types.
176///
177/// NOTE: Ephemeral storage values for non-NVMe instances are conservative defaults
178/// (EBS-backed root volumes). Storage-optimized instances list their NVMe capacity.
179static CATALOG: &[InstanceTypeSpec] = &[
180    // =========================================================================
181    // AWS — ARM (Graviton) preferred for cost efficiency
182    // =========================================================================
183
184    // Burstable (t4g — ARM Graviton2)
185    InstanceTypeSpec {
186        name: "t4g.micro",
187        platform: Platform::Aws,
188        family: InstanceFamily::Burstable,
189        architecture: Architecture::Arm64,
190        vcpu: 2,
191        memory_bytes: 1 * GI,
192        ephemeral_storage_bytes: 20 * GI,
193        gpu: None,
194    },
195    InstanceTypeSpec {
196        name: "t4g.small",
197        platform: Platform::Aws,
198        family: InstanceFamily::Burstable,
199        architecture: Architecture::Arm64,
200        vcpu: 2,
201        memory_bytes: 2 * GI,
202        ephemeral_storage_bytes: 20 * GI,
203        gpu: None,
204    },
205    InstanceTypeSpec {
206        name: "t4g.medium",
207        platform: Platform::Aws,
208        family: InstanceFamily::Burstable,
209        architecture: Architecture::Arm64,
210        vcpu: 2,
211        memory_bytes: 4 * GI,
212        ephemeral_storage_bytes: 20 * GI,
213        gpu: None,
214    },
215    InstanceTypeSpec {
216        name: "t4g.large",
217        platform: Platform::Aws,
218        family: InstanceFamily::Burstable,
219        architecture: Architecture::Arm64,
220        vcpu: 2,
221        memory_bytes: 8 * GI,
222        ephemeral_storage_bytes: 20 * GI,
223        gpu: None,
224    },
225    InstanceTypeSpec {
226        name: "t4g.xlarge",
227        platform: Platform::Aws,
228        family: InstanceFamily::Burstable,
229        architecture: Architecture::Arm64,
230        vcpu: 4,
231        memory_bytes: 16 * GI,
232        ephemeral_storage_bytes: 20 * GI,
233        gpu: None,
234    },
235    // General Purpose (m7g — ARM Graviton3, up to 2xlarge / 8 vCPU)
236    InstanceTypeSpec {
237        name: "m7g.medium",
238        platform: Platform::Aws,
239        family: InstanceFamily::GeneralPurpose,
240        architecture: Architecture::Arm64,
241        vcpu: 1,
242        memory_bytes: 4 * GI,
243        ephemeral_storage_bytes: 20 * GI,
244        gpu: None,
245    },
246    InstanceTypeSpec {
247        name: "m7g.large",
248        platform: Platform::Aws,
249        family: InstanceFamily::GeneralPurpose,
250        architecture: Architecture::Arm64,
251        vcpu: 2,
252        memory_bytes: 8 * GI,
253        ephemeral_storage_bytes: 20 * GI,
254        gpu: None,
255    },
256    InstanceTypeSpec {
257        name: "m7g.xlarge",
258        platform: Platform::Aws,
259        family: InstanceFamily::GeneralPurpose,
260        architecture: Architecture::Arm64,
261        vcpu: 4,
262        memory_bytes: 16 * GI,
263        ephemeral_storage_bytes: 20 * GI,
264        gpu: None,
265    },
266    InstanceTypeSpec {
267        name: "m7g.2xlarge",
268        platform: Platform::Aws,
269        family: InstanceFamily::GeneralPurpose,
270        architecture: Architecture::Arm64,
271        vcpu: 8,
272        memory_bytes: 32 * GI,
273        ephemeral_storage_bytes: 20 * GI,
274        gpu: None,
275    },
276    InstanceTypeSpec {
277        name: "m7g.4xlarge",
278        platform: Platform::Aws,
279        family: InstanceFamily::GeneralPurpose,
280        architecture: Architecture::Arm64,
281        vcpu: 16,
282        memory_bytes: 64 * GI,
283        ephemeral_storage_bytes: 20 * GI,
284        gpu: None,
285    },
286    // Compute Optimized (c7g — ARM Graviton3, up to 2xlarge / 8 vCPU)
287    InstanceTypeSpec {
288        name: "c7g.medium",
289        platform: Platform::Aws,
290        family: InstanceFamily::ComputeOptimized,
291        architecture: Architecture::Arm64,
292        vcpu: 1,
293        memory_bytes: 2 * GI,
294        ephemeral_storage_bytes: 20 * GI,
295        gpu: None,
296    },
297    InstanceTypeSpec {
298        name: "c7g.large",
299        platform: Platform::Aws,
300        family: InstanceFamily::ComputeOptimized,
301        architecture: Architecture::Arm64,
302        vcpu: 2,
303        memory_bytes: 4 * GI,
304        ephemeral_storage_bytes: 20 * GI,
305        gpu: None,
306    },
307    InstanceTypeSpec {
308        name: "c7g.xlarge",
309        platform: Platform::Aws,
310        family: InstanceFamily::ComputeOptimized,
311        architecture: Architecture::Arm64,
312        vcpu: 4,
313        memory_bytes: 8 * GI,
314        ephemeral_storage_bytes: 20 * GI,
315        gpu: None,
316    },
317    InstanceTypeSpec {
318        name: "c7g.2xlarge",
319        platform: Platform::Aws,
320        family: InstanceFamily::ComputeOptimized,
321        architecture: Architecture::Arm64,
322        vcpu: 8,
323        memory_bytes: 16 * GI,
324        ephemeral_storage_bytes: 20 * GI,
325        gpu: None,
326    },
327    InstanceTypeSpec {
328        name: "c7g.4xlarge",
329        platform: Platform::Aws,
330        family: InstanceFamily::ComputeOptimized,
331        architecture: Architecture::Arm64,
332        vcpu: 16,
333        memory_bytes: 32 * GI,
334        ephemeral_storage_bytes: 20 * GI,
335        gpu: None,
336    },
337    // Memory Optimized (r7g — ARM Graviton3, up to 2xlarge / 8 vCPU)
338    InstanceTypeSpec {
339        name: "r7g.medium",
340        platform: Platform::Aws,
341        family: InstanceFamily::MemoryOptimized,
342        architecture: Architecture::Arm64,
343        vcpu: 1,
344        memory_bytes: 8 * GI,
345        ephemeral_storage_bytes: 20 * GI,
346        gpu: None,
347    },
348    InstanceTypeSpec {
349        name: "r7g.large",
350        platform: Platform::Aws,
351        family: InstanceFamily::MemoryOptimized,
352        architecture: Architecture::Arm64,
353        vcpu: 2,
354        memory_bytes: 16 * GI,
355        ephemeral_storage_bytes: 20 * GI,
356        gpu: None,
357    },
358    InstanceTypeSpec {
359        name: "r7g.xlarge",
360        platform: Platform::Aws,
361        family: InstanceFamily::MemoryOptimized,
362        architecture: Architecture::Arm64,
363        vcpu: 4,
364        memory_bytes: 32 * GI,
365        ephemeral_storage_bytes: 20 * GI,
366        gpu: None,
367    },
368    InstanceTypeSpec {
369        name: "r7g.2xlarge",
370        platform: Platform::Aws,
371        family: InstanceFamily::MemoryOptimized,
372        architecture: Architecture::Arm64,
373        vcpu: 8,
374        memory_bytes: 64 * GI,
375        ephemeral_storage_bytes: 20 * GI,
376        gpu: None,
377    },
378    InstanceTypeSpec {
379        name: "r7g.4xlarge",
380        platform: Platform::Aws,
381        family: InstanceFamily::MemoryOptimized,
382        architecture: Architecture::Arm64,
383        vcpu: 16,
384        memory_bytes: 128 * GI,
385        ephemeral_storage_bytes: 20 * GI,
386        gpu: None,
387    },
388    // Storage Optimized (i4i — x86_64, NVMe)
389    InstanceTypeSpec {
390        name: "i4i.xlarge",
391        platform: Platform::Aws,
392        family: InstanceFamily::StorageOptimized,
393        architecture: Architecture::X86_64,
394        vcpu: 4,
395        memory_bytes: 32 * GI,
396        ephemeral_storage_bytes: 937 * GI,
397        gpu: None,
398    },
399    InstanceTypeSpec {
400        name: "i4i.2xlarge",
401        platform: Platform::Aws,
402        family: InstanceFamily::StorageOptimized,
403        architecture: Architecture::X86_64,
404        vcpu: 8,
405        memory_bytes: 64 * GI,
406        ephemeral_storage_bytes: 1875 * GI,
407        gpu: None,
408    },
409    InstanceTypeSpec {
410        name: "i4i.4xlarge",
411        platform: Platform::Aws,
412        family: InstanceFamily::StorageOptimized,
413        architecture: Architecture::X86_64,
414        vcpu: 16,
415        memory_bytes: 128 * GI,
416        ephemeral_storage_bytes: 3750 * GI,
417        gpu: None,
418    },
419    InstanceTypeSpec {
420        name: "i4i.8xlarge",
421        platform: Platform::Aws,
422        family: InstanceFamily::StorageOptimized,
423        architecture: Architecture::X86_64,
424        vcpu: 32,
425        memory_bytes: 256 * GI,
426        ephemeral_storage_bytes: 7500 * GI,
427        gpu: None,
428    },
429    // GPU — NVIDIA T4 (g5 — x86_64)
430    InstanceTypeSpec {
431        name: "g5.xlarge",
432        platform: Platform::Aws,
433        family: InstanceFamily::GpuCompute,
434        architecture: Architecture::X86_64,
435        vcpu: 4,
436        memory_bytes: 16 * GI,
437        ephemeral_storage_bytes: 250 * GI,
438        gpu: Some(CatalogGpu {
439            gpu_type: "nvidia-t4",
440            count: 1,
441        }),
442    },
443    InstanceTypeSpec {
444        name: "g5.2xlarge",
445        platform: Platform::Aws,
446        family: InstanceFamily::GpuCompute,
447        architecture: Architecture::X86_64,
448        vcpu: 8,
449        memory_bytes: 32 * GI,
450        ephemeral_storage_bytes: 450 * GI,
451        gpu: Some(CatalogGpu {
452            gpu_type: "nvidia-t4",
453            count: 1,
454        }),
455    },
456    // GPU — NVIDIA A100 (p4d — x86_64)
457    InstanceTypeSpec {
458        name: "p4d.24xlarge",
459        platform: Platform::Aws,
460        family: InstanceFamily::GpuCompute,
461        architecture: Architecture::X86_64,
462        vcpu: 96,
463        memory_bytes: 1152 * GI,
464        ephemeral_storage_bytes: 8000 * GI,
465        gpu: Some(CatalogGpu {
466            gpu_type: "nvidia-a100",
467            count: 8,
468        }),
469    },
470    // GPU — NVIDIA H100 (p5 — x86_64)
471    InstanceTypeSpec {
472        name: "p5.48xlarge",
473        platform: Platform::Aws,
474        family: InstanceFamily::GpuCompute,
475        architecture: Architecture::X86_64,
476        vcpu: 192,
477        memory_bytes: 2048 * GI,
478        ephemeral_storage_bytes: 8000 * GI,
479        gpu: Some(CatalogGpu {
480            gpu_type: "nvidia-h100",
481            count: 8,
482        }),
483    },
484    // =========================================================================
485    // GCP
486    // =========================================================================
487
488    // Burstable (e2)
489    InstanceTypeSpec {
490        name: "e2-micro",
491        platform: Platform::Gcp,
492        family: InstanceFamily::Burstable,
493        architecture: Architecture::X86_64,
494        vcpu: 2,
495        memory_bytes: 1 * GI,
496        ephemeral_storage_bytes: 20 * GI,
497        gpu: None,
498    },
499    InstanceTypeSpec {
500        name: "e2-small",
501        platform: Platform::Gcp,
502        family: InstanceFamily::Burstable,
503        architecture: Architecture::X86_64,
504        vcpu: 2,
505        memory_bytes: 2 * GI,
506        ephemeral_storage_bytes: 20 * GI,
507        gpu: None,
508    },
509    InstanceTypeSpec {
510        name: "e2-medium",
511        platform: Platform::Gcp,
512        family: InstanceFamily::Burstable,
513        architecture: Architecture::X86_64,
514        vcpu: 2,
515        memory_bytes: 4 * GI,
516        ephemeral_storage_bytes: 20 * GI,
517        gpu: None,
518    },
519    // General Purpose (n2-standard, up to 16 vCPU)
520    InstanceTypeSpec {
521        name: "n2-standard-2",
522        platform: Platform::Gcp,
523        family: InstanceFamily::GeneralPurpose,
524        architecture: Architecture::X86_64,
525        vcpu: 2,
526        memory_bytes: 8 * GI,
527        ephemeral_storage_bytes: 20 * GI,
528        gpu: None,
529    },
530    InstanceTypeSpec {
531        name: "n2-standard-4",
532        platform: Platform::Gcp,
533        family: InstanceFamily::GeneralPurpose,
534        architecture: Architecture::X86_64,
535        vcpu: 4,
536        memory_bytes: 16 * GI,
537        ephemeral_storage_bytes: 20 * GI,
538        gpu: None,
539    },
540    InstanceTypeSpec {
541        name: "n2-standard-8",
542        platform: Platform::Gcp,
543        family: InstanceFamily::GeneralPurpose,
544        architecture: Architecture::X86_64,
545        vcpu: 8,
546        memory_bytes: 32 * GI,
547        ephemeral_storage_bytes: 20 * GI,
548        gpu: None,
549    },
550    InstanceTypeSpec {
551        name: "n2-standard-16",
552        platform: Platform::Gcp,
553        family: InstanceFamily::GeneralPurpose,
554        architecture: Architecture::X86_64,
555        vcpu: 16,
556        memory_bytes: 64 * GI,
557        ephemeral_storage_bytes: 20 * GI,
558        gpu: None,
559    },
560    // Compute Optimized (c3-standard, up to 8 vCPU)
561    InstanceTypeSpec {
562        name: "c3-standard-4",
563        platform: Platform::Gcp,
564        family: InstanceFamily::ComputeOptimized,
565        architecture: Architecture::X86_64,
566        vcpu: 4,
567        memory_bytes: 8 * GI,
568        ephemeral_storage_bytes: 20 * GI,
569        gpu: None,
570    },
571    InstanceTypeSpec {
572        name: "c3-standard-8",
573        platform: Platform::Gcp,
574        family: InstanceFamily::ComputeOptimized,
575        architecture: Architecture::X86_64,
576        vcpu: 8,
577        memory_bytes: 16 * GI,
578        ephemeral_storage_bytes: 20 * GI,
579        gpu: None,
580    },
581    // Memory Optimized (n2-highmem, up to 8 vCPU)
582    InstanceTypeSpec {
583        name: "n2-highmem-2",
584        platform: Platform::Gcp,
585        family: InstanceFamily::MemoryOptimized,
586        architecture: Architecture::X86_64,
587        vcpu: 2,
588        memory_bytes: 16 * GI,
589        ephemeral_storage_bytes: 20 * GI,
590        gpu: None,
591    },
592    InstanceTypeSpec {
593        name: "n2-highmem-4",
594        platform: Platform::Gcp,
595        family: InstanceFamily::MemoryOptimized,
596        architecture: Architecture::X86_64,
597        vcpu: 4,
598        memory_bytes: 32 * GI,
599        ephemeral_storage_bytes: 20 * GI,
600        gpu: None,
601    },
602    InstanceTypeSpec {
603        name: "n2-highmem-8",
604        platform: Platform::Gcp,
605        family: InstanceFamily::MemoryOptimized,
606        architecture: Architecture::X86_64,
607        vcpu: 8,
608        memory_bytes: 64 * GI,
609        ephemeral_storage_bytes: 20 * GI,
610        gpu: None,
611    },
612    InstanceTypeSpec {
613        name: "n2-highmem-16",
614        platform: Platform::Gcp,
615        family: InstanceFamily::MemoryOptimized,
616        architecture: Architecture::X86_64,
617        vcpu: 16,
618        memory_bytes: 128 * GI,
619        ephemeral_storage_bytes: 20 * GI,
620        gpu: None,
621    },
622    InstanceTypeSpec {
623        name: "n2-highmem-32",
624        platform: Platform::Gcp,
625        family: InstanceFamily::MemoryOptimized,
626        architecture: Architecture::X86_64,
627        vcpu: 32,
628        memory_bytes: 256 * GI,
629        ephemeral_storage_bytes: 20 * GI,
630        gpu: None,
631    },
632    // Storage Optimized (c3d-standard with local SSD)
633    InstanceTypeSpec {
634        name: "c3d-standard-8",
635        platform: Platform::Gcp,
636        family: InstanceFamily::StorageOptimized,
637        architecture: Architecture::X86_64,
638        vcpu: 8,
639        memory_bytes: 32 * GI,
640        ephemeral_storage_bytes: 480 * GI,
641        gpu: None,
642    },
643    InstanceTypeSpec {
644        name: "c3d-standard-16",
645        platform: Platform::Gcp,
646        family: InstanceFamily::StorageOptimized,
647        architecture: Architecture::X86_64,
648        vcpu: 16,
649        memory_bytes: 64 * GI,
650        ephemeral_storage_bytes: 960 * GI,
651        gpu: None,
652    },
653    InstanceTypeSpec {
654        name: "c3d-standard-30",
655        platform: Platform::Gcp,
656        family: InstanceFamily::StorageOptimized,
657        architecture: Architecture::X86_64,
658        vcpu: 30,
659        memory_bytes: 120 * GI,
660        ephemeral_storage_bytes: 1920 * GI,
661        gpu: None,
662    },
663    // GPU — NVIDIA T4 (n1-standard + T4)
664    InstanceTypeSpec {
665        name: "n1-standard-4-t4",
666        platform: Platform::Gcp,
667        family: InstanceFamily::GpuCompute,
668        architecture: Architecture::X86_64,
669        vcpu: 4,
670        memory_bytes: 15 * GI,
671        ephemeral_storage_bytes: 100 * GI,
672        gpu: Some(CatalogGpu {
673            gpu_type: "nvidia-t4",
674            count: 1,
675        }),
676    },
677    // GPU — NVIDIA A100 (a2-highgpu)
678    InstanceTypeSpec {
679        name: "a2-highgpu-1g",
680        platform: Platform::Gcp,
681        family: InstanceFamily::GpuCompute,
682        architecture: Architecture::X86_64,
683        vcpu: 12,
684        memory_bytes: 85 * GI,
685        ephemeral_storage_bytes: 100 * GI,
686        gpu: Some(CatalogGpu {
687            gpu_type: "nvidia-a100",
688            count: 1,
689        }),
690    },
691    InstanceTypeSpec {
692        name: "a2-highgpu-8g",
693        platform: Platform::Gcp,
694        family: InstanceFamily::GpuCompute,
695        architecture: Architecture::X86_64,
696        vcpu: 96,
697        memory_bytes: 1360 * GI,
698        ephemeral_storage_bytes: 100 * GI,
699        gpu: Some(CatalogGpu {
700            gpu_type: "nvidia-a100",
701            count: 8,
702        }),
703    },
704    // GPU — NVIDIA H100 (a3-highgpu)
705    InstanceTypeSpec {
706        name: "a3-highgpu-8g",
707        platform: Platform::Gcp,
708        family: InstanceFamily::GpuCompute,
709        architecture: Architecture::X86_64,
710        vcpu: 208,
711        memory_bytes: 1872 * GI,
712        ephemeral_storage_bytes: 100 * GI,
713        gpu: Some(CatalogGpu {
714            gpu_type: "nvidia-h100",
715            count: 8,
716        }),
717    },
718    // =========================================================================
719    // Azure
720    // =========================================================================
721
722    // Burstable (B-series v2)
723    InstanceTypeSpec {
724        name: "Standard_B1s",
725        platform: Platform::Azure,
726        family: InstanceFamily::Burstable,
727        architecture: Architecture::X86_64,
728        vcpu: 1,
729        memory_bytes: 1 * GI,
730        ephemeral_storage_bytes: 20 * GI,
731        gpu: None,
732    },
733    InstanceTypeSpec {
734        name: "Standard_B2s",
735        platform: Platform::Azure,
736        family: InstanceFamily::Burstable,
737        architecture: Architecture::X86_64,
738        vcpu: 2,
739        memory_bytes: 4 * GI,
740        ephemeral_storage_bytes: 20 * GI,
741        gpu: None,
742    },
743    InstanceTypeSpec {
744        name: "Standard_B2ms",
745        platform: Platform::Azure,
746        family: InstanceFamily::Burstable,
747        architecture: Architecture::X86_64,
748        vcpu: 2,
749        memory_bytes: 8 * GI,
750        ephemeral_storage_bytes: 20 * GI,
751        gpu: None,
752    },
753    InstanceTypeSpec {
754        name: "Standard_B4ms",
755        platform: Platform::Azure,
756        family: InstanceFamily::Burstable,
757        architecture: Architecture::X86_64,
758        vcpu: 4,
759        memory_bytes: 16 * GI,
760        ephemeral_storage_bytes: 20 * GI,
761        gpu: None,
762    },
763    // General Purpose (Dv5-series, up to 16 vCPU)
764    InstanceTypeSpec {
765        name: "Standard_D2s_v5",
766        platform: Platform::Azure,
767        family: InstanceFamily::GeneralPurpose,
768        architecture: Architecture::X86_64,
769        vcpu: 2,
770        memory_bytes: 8 * GI,
771        ephemeral_storage_bytes: 20 * GI,
772        gpu: None,
773    },
774    InstanceTypeSpec {
775        name: "Standard_D4s_v5",
776        platform: Platform::Azure,
777        family: InstanceFamily::GeneralPurpose,
778        architecture: Architecture::X86_64,
779        vcpu: 4,
780        memory_bytes: 16 * GI,
781        ephemeral_storage_bytes: 20 * GI,
782        gpu: None,
783    },
784    InstanceTypeSpec {
785        name: "Standard_D8s_v5",
786        platform: Platform::Azure,
787        family: InstanceFamily::GeneralPurpose,
788        architecture: Architecture::X86_64,
789        vcpu: 8,
790        memory_bytes: 32 * GI,
791        ephemeral_storage_bytes: 20 * GI,
792        gpu: None,
793    },
794    InstanceTypeSpec {
795        name: "Standard_D16s_v5",
796        platform: Platform::Azure,
797        family: InstanceFamily::GeneralPurpose,
798        architecture: Architecture::X86_64,
799        vcpu: 16,
800        memory_bytes: 64 * GI,
801        ephemeral_storage_bytes: 20 * GI,
802        gpu: None,
803    },
804    // Compute Optimized (Fv2-series, up to 16 vCPU)
805    InstanceTypeSpec {
806        name: "Standard_F2s_v2",
807        platform: Platform::Azure,
808        family: InstanceFamily::ComputeOptimized,
809        architecture: Architecture::X86_64,
810        vcpu: 2,
811        memory_bytes: 4 * GI,
812        ephemeral_storage_bytes: 20 * GI,
813        gpu: None,
814    },
815    InstanceTypeSpec {
816        name: "Standard_F4s_v2",
817        platform: Platform::Azure,
818        family: InstanceFamily::ComputeOptimized,
819        architecture: Architecture::X86_64,
820        vcpu: 4,
821        memory_bytes: 8 * GI,
822        ephemeral_storage_bytes: 20 * GI,
823        gpu: None,
824    },
825    InstanceTypeSpec {
826        name: "Standard_F8s_v2",
827        platform: Platform::Azure,
828        family: InstanceFamily::ComputeOptimized,
829        architecture: Architecture::X86_64,
830        vcpu: 8,
831        memory_bytes: 16 * GI,
832        ephemeral_storage_bytes: 20 * GI,
833        gpu: None,
834    },
835    InstanceTypeSpec {
836        name: "Standard_F16s_v2",
837        platform: Platform::Azure,
838        family: InstanceFamily::ComputeOptimized,
839        architecture: Architecture::X86_64,
840        vcpu: 16,
841        memory_bytes: 32 * GI,
842        ephemeral_storage_bytes: 20 * GI,
843        gpu: None,
844    },
845    // Memory Optimized (Ev5-series, up to 16 vCPU)
846    InstanceTypeSpec {
847        name: "Standard_E2s_v5",
848        platform: Platform::Azure,
849        family: InstanceFamily::MemoryOptimized,
850        architecture: Architecture::X86_64,
851        vcpu: 2,
852        memory_bytes: 16 * GI,
853        ephemeral_storage_bytes: 20 * GI,
854        gpu: None,
855    },
856    InstanceTypeSpec {
857        name: "Standard_E4s_v5",
858        platform: Platform::Azure,
859        family: InstanceFamily::MemoryOptimized,
860        architecture: Architecture::X86_64,
861        vcpu: 4,
862        memory_bytes: 32 * GI,
863        ephemeral_storage_bytes: 20 * GI,
864        gpu: None,
865    },
866    InstanceTypeSpec {
867        name: "Standard_E8s_v5",
868        platform: Platform::Azure,
869        family: InstanceFamily::MemoryOptimized,
870        architecture: Architecture::X86_64,
871        vcpu: 8,
872        memory_bytes: 64 * GI,
873        ephemeral_storage_bytes: 20 * GI,
874        gpu: None,
875    },
876    InstanceTypeSpec {
877        name: "Standard_E16s_v5",
878        platform: Platform::Azure,
879        family: InstanceFamily::MemoryOptimized,
880        architecture: Architecture::X86_64,
881        vcpu: 16,
882        memory_bytes: 128 * GI,
883        ephemeral_storage_bytes: 20 * GI,
884        gpu: None,
885    },
886    // Storage Optimized (Lsv3-series with NVMe)
887    InstanceTypeSpec {
888        name: "Standard_L8s_v3",
889        platform: Platform::Azure,
890        family: InstanceFamily::StorageOptimized,
891        architecture: Architecture::X86_64,
892        vcpu: 8,
893        memory_bytes: 64 * GI,
894        ephemeral_storage_bytes: 1788 * GI,
895        gpu: None,
896    },
897    InstanceTypeSpec {
898        name: "Standard_L16s_v3",
899        platform: Platform::Azure,
900        family: InstanceFamily::StorageOptimized,
901        architecture: Architecture::X86_64,
902        vcpu: 16,
903        memory_bytes: 128 * GI,
904        ephemeral_storage_bytes: 3576 * GI,
905        gpu: None,
906    },
907    InstanceTypeSpec {
908        name: "Standard_L32s_v3",
909        platform: Platform::Azure,
910        family: InstanceFamily::StorageOptimized,
911        architecture: Architecture::X86_64,
912        vcpu: 32,
913        memory_bytes: 256 * GI,
914        ephemeral_storage_bytes: 7154 * GI,
915        gpu: None,
916    },
917    // GPU — NVIDIA T4 (NCasT4_v3-series)
918    InstanceTypeSpec {
919        name: "Standard_NC4as_T4_v3",
920        platform: Platform::Azure,
921        family: InstanceFamily::GpuCompute,
922        architecture: Architecture::X86_64,
923        vcpu: 4,
924        memory_bytes: 28 * GI,
925        ephemeral_storage_bytes: 176 * GI,
926        gpu: Some(CatalogGpu {
927            gpu_type: "nvidia-t4",
928            count: 1,
929        }),
930    },
931    // GPU — NVIDIA A100 (NC A100 v4-series)
932    InstanceTypeSpec {
933        name: "Standard_NC24ads_A100_v4",
934        platform: Platform::Azure,
935        family: InstanceFamily::GpuCompute,
936        architecture: Architecture::X86_64,
937        vcpu: 24,
938        memory_bytes: 220 * GI,
939        ephemeral_storage_bytes: 958 * GI,
940        gpu: Some(CatalogGpu {
941            gpu_type: "nvidia-a100",
942            count: 1,
943        }),
944    },
945    InstanceTypeSpec {
946        name: "Standard_NC96ads_A100_v4",
947        platform: Platform::Azure,
948        family: InstanceFamily::GpuCompute,
949        architecture: Architecture::X86_64,
950        vcpu: 96,
951        memory_bytes: 880 * GI,
952        ephemeral_storage_bytes: 3916 * GI,
953        gpu: Some(CatalogGpu {
954            gpu_type: "nvidia-a100",
955            count: 4,
956        }),
957    },
958    // GPU — NVIDIA H100 (ND H100 v5-series)
959    InstanceTypeSpec {
960        name: "Standard_ND96isr_H100_v5",
961        platform: Platform::Azure,
962        family: InstanceFamily::GpuCompute,
963        architecture: Architecture::X86_64,
964        vcpu: 96,
965        memory_bytes: 1900 * GI,
966        ephemeral_storage_bytes: 1000 * GI,
967        gpu: Some(CatalogGpu {
968            gpu_type: "nvidia-h100",
969            count: 8,
970        }),
971    },
972];
973
974// ---------------------------------------------------------------------------
975// Catalog lookup
976// ---------------------------------------------------------------------------
977
978/// Get all instance types for a given platform.
979pub fn catalog_for_platform(platform: Platform) -> Vec<&'static InstanceTypeSpec> {
980    CATALOG
981        .iter()
982        .filter(|spec| spec.platform == platform)
983        .collect()
984}
985
986/// Find a specific instance type by name and platform.
987pub fn find_instance_type(platform: Platform, name: &str) -> Option<&'static InstanceTypeSpec> {
988    CATALOG
989        .iter()
990        .find(|spec| spec.platform == platform && spec.name == name)
991}
992
993// ---------------------------------------------------------------------------
994// Instance type selection
995// ---------------------------------------------------------------------------
996
997/// Aggregated resource requirements from all containers in a capacity group.
998#[derive(Debug, Clone)]
999pub struct WorkloadRequirements {
1000    /// Total CPU needed at maximum scale (sum of desired CPU * max_replicas per container)
1001    pub total_cpu_at_max: f64,
1002    /// Total memory needed at maximum scale (sum of desired memory * max_replicas per container)
1003    pub total_memory_bytes_at_max: u64,
1004    /// Largest CPU request among all individual containers (single replica)
1005    pub max_cpu_per_container: f64,
1006    /// Largest memory request among all individual containers (single replica)
1007    pub max_memory_per_container: u64,
1008    /// Maximum ephemeral storage any single container requires
1009    pub max_ephemeral_storage_bytes: u64,
1010    /// GPU requirement (if any container needs GPU)
1011    pub gpu: Option<GpuSpec>,
1012}
1013
1014/// Result of instance type selection.
1015#[derive(Debug, Clone)]
1016pub struct InstanceSelection {
1017    /// Selected instance type name (e.g., "m7g.2xlarge")
1018    pub instance_type: &'static str,
1019    /// Machine profile derived from the instance type
1020    pub profile: MachineProfile,
1021    /// Recommended minimum number of machines
1022    pub min_machines: u32,
1023    /// Recommended maximum number of machines
1024    pub max_machines: u32,
1025}
1026
1027/// Ephemeral storage threshold above which storage-optimized instances are selected.
1028const STORAGE_OPTIMIZED_THRESHOLD: u64 = 200 * GI;
1029
1030/// Maximum number of machines per cluster (Horizon limit).
1031const MAX_MACHINES_PER_CLUSTER: u32 = 10;
1032
1033/// Hard cap on vCPUs for non-GPU/non-storage workloads. Equivalent to AWS 2xlarge.
1034/// Beyond this, horizontal scaling is always preferred over bigger machines.
1035const MAX_STANDARD_VCPU: u32 = 8;
1036
1037/// How many of the largest container we want to fit per machine (for bin-packing).
1038const CONTAINERS_PER_MACHINE: f64 = 4.0;
1039
1040/// Overhead factor for system processes and bin-packing inefficiency.
1041const OVERHEAD_FACTOR: f64 = 1.25;
1042
1043/// Select the best instance type for a workload on a given platform.
1044///
1045/// The algorithm:
1046/// 1. GPU workloads: Match by GPU type, find smallest instance with enough GPUs.
1047/// 2. Storage-heavy workloads (>200Gi ephemeral): Use storage-optimized instances.
1048/// 3. All other workloads: Size the machine to fit ~4 of the largest container
1049///    with overhead, capped at 8 vCPUs. Use GeneralPurpose family for broad
1050///    availability and reasonable cost. Scale horizontally for more capacity.
1051///
1052/// Returns an error if no suitable instance type is found.
1053pub fn select_instance_type(
1054    platform: Platform,
1055    requirements: &WorkloadRequirements,
1056) -> Result<InstanceSelection, String> {
1057    // Determine which family to use
1058    let family = select_family(requirements);
1059
1060    // Filter catalog to matching platform + family
1061    let candidates: Vec<&InstanceTypeSpec> = CATALOG
1062        .iter()
1063        .filter(|spec| spec.platform == platform && spec.family == family)
1064        .collect();
1065
1066    if candidates.is_empty() {
1067        return Err(format!(
1068            "no {family:?} instance types in catalog for platform {platform}"
1069        ));
1070    }
1071
1072    // For GPU workloads, filter by GPU type
1073    let candidates = if let Some(ref gpu) = requirements.gpu {
1074        let filtered: Vec<&InstanceTypeSpec> = candidates
1075            .into_iter()
1076            .filter(|spec| {
1077                spec.gpu.as_ref().map_or(false, |g| {
1078                    g.gpu_type == gpu.gpu_type && g.count >= gpu.count
1079                })
1080            })
1081            .collect();
1082        if filtered.is_empty() {
1083            return Err(format!(
1084                "no instance type for GPU type '{}' x{} on platform {platform}",
1085                gpu.gpu_type, gpu.count
1086            ));
1087        }
1088        filtered
1089    } else {
1090        candidates
1091    };
1092
1093    // For storage workloads, filter by ephemeral storage capacity
1094    let candidates = if family == InstanceFamily::StorageOptimized {
1095        let filtered: Vec<&InstanceTypeSpec> = candidates
1096            .into_iter()
1097            .filter(|spec| spec.ephemeral_storage_bytes >= requirements.max_ephemeral_storage_bytes)
1098            .collect();
1099        if filtered.is_empty() {
1100            return Err(format!(
1101                "no storage-optimized instance with >= {} bytes ephemeral storage on platform {platform}",
1102                requirements.max_ephemeral_storage_bytes
1103            ));
1104        }
1105        filtered
1106    } else {
1107        candidates
1108    };
1109
1110    // Size the instance based on the largest single container, not total workload.
1111    // Target: fit ~4 of the largest container per machine with overhead.
1112    let target_cpu =
1113        (requirements.max_cpu_per_container * CONTAINERS_PER_MACHINE * OVERHEAD_FACTOR).max(0.25);
1114    let target_memory =
1115        (requirements.max_memory_per_container as f64 * CONTAINERS_PER_MACHINE * OVERHEAD_FACTOR)
1116            .max(256.0 * MI as f64);
1117
1118    // Cap at MAX_STANDARD_VCPU for non-GPU/non-storage workloads
1119    let vcpu_cap =
1120        if family == InstanceFamily::GpuCompute || family == InstanceFamily::StorageOptimized {
1121            u32::MAX
1122        } else {
1123            MAX_STANDARD_VCPU
1124        };
1125
1126    // Find the smallest instance that meets per-container targets within the cap.
1127    let selected = candidates
1128        .iter()
1129        .filter(|spec| {
1130            spec.vcpu <= vcpu_cap
1131                && spec.vcpu as f64 >= target_cpu
1132                && spec.memory_bytes as f64 >= target_memory
1133        })
1134        .min_by_key(|spec| spec.vcpu)
1135        .or_else(|| {
1136            // If nothing fits within the cap, pick the largest instance under the cap
1137            candidates
1138                .iter()
1139                .filter(|spec| spec.vcpu <= vcpu_cap)
1140                .max_by_key(|spec| spec.vcpu)
1141        })
1142        .or_else(|| {
1143            // Last resort: pick the smallest available instance (for GPU/storage)
1144            candidates.iter().min_by_key(|spec| spec.vcpu)
1145        })
1146        .ok_or_else(|| format!("no instance types available for platform {platform}"))?;
1147
1148    // Calculate machine counts
1149    let max_machines = compute_max_machines(requirements, selected);
1150    let min_machines = compute_min_machines(max_machines);
1151
1152    Ok(InstanceSelection {
1153        instance_type: selected.name,
1154        profile: selected.to_machine_profile(),
1155        min_machines,
1156        max_machines,
1157    })
1158}
1159
1160/// Select instance family based on workload characteristics.
1161///
1162/// Uses GeneralPurpose for all standard workloads — widely available across
1163/// regions and cost-effective. Only specialized workloads (GPU, large ephemeral
1164/// storage) get specialized families. Very small workloads get burstable.
1165pub fn select_family(requirements: &WorkloadRequirements) -> InstanceFamily {
1166    // GPU workloads always get GPU instances
1167    if requirements.gpu.is_some() {
1168        return InstanceFamily::GpuCompute;
1169    }
1170
1171    // Large ephemeral storage needs NVMe (storage-optimized)
1172    if requirements.max_ephemeral_storage_bytes > STORAGE_OPTIMIZED_THRESHOLD {
1173        return InstanceFamily::StorageOptimized;
1174    }
1175
1176    // Very small workloads use burstable instances
1177    if requirements.total_cpu_at_max < 2.0 {
1178        return InstanceFamily::Burstable;
1179    }
1180
1181    // All other workloads use GeneralPurpose — available everywhere, good pricing
1182    InstanceFamily::GeneralPurpose
1183}
1184
1185/// Calculate maximum machines needed to fit the workload with headroom.
1186fn compute_max_machines(requirements: &WorkloadRequirements, instance: &InstanceTypeSpec) -> u32 {
1187    // How many machines to fit total CPU at max scale (with 25% headroom)
1188    let cpu_with_headroom = requirements.total_cpu_at_max * 1.25;
1189    let cpu_machines = (cpu_with_headroom / instance.vcpu as f64).ceil() as u32;
1190
1191    // How many machines to fit total memory at max scale (with 25% headroom)
1192    let mem_with_headroom = requirements.total_memory_bytes_at_max as f64 * 1.25;
1193    let mem_machines = (mem_with_headroom / instance.memory_bytes as f64).ceil() as u32;
1194
1195    // Take the larger of CPU-based and memory-based, clamped to cluster limit
1196    cpu_machines
1197        .max(mem_machines)
1198        .max(1)
1199        .min(MAX_MACHINES_PER_CLUSTER)
1200}
1201
/// Derive the recommended minimum machine count from the maximum.
///
/// Clusters that may grow to three or more machines keep at least two
/// running (for HA); smaller clusters keep one.
fn compute_min_machines(max_machines: u32) -> u32 {
    match max_machines {
        0..=2 => 1,
        _ => 2,
    }
}
1211
1212// ---------------------------------------------------------------------------
1213// Tests
1214// ---------------------------------------------------------------------------
1215
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `WorkloadRequirements` from positional values.
    ///
    /// All memory and storage arguments are given in GiB and converted to bytes.
    fn workload(
        total_cpu: f64,
        total_memory_gi: u64,
        cpu_per_container: f64,
        memory_per_container_gi: u64,
        ephemeral_gi: u64,
        gpu: Option<GpuSpec>,
    ) -> WorkloadRequirements {
        WorkloadRequirements {
            total_cpu_at_max: total_cpu,
            total_memory_bytes_at_max: total_memory_gi * GI,
            max_cpu_per_container: cpu_per_container,
            max_memory_per_container: memory_per_container_gi * GI,
            max_ephemeral_storage_bytes: ephemeral_gi * GI,
            gpu,
        }
    }

    /// Run the selection and resolve the chosen name back to its catalog entry.
    fn selected_spec(platform: Platform, req: &WorkloadRequirements) -> &'static InstanceTypeSpec {
        let sel = select_instance_type(platform, req).unwrap();
        find_instance_type(platform, sel.instance_type).unwrap()
    }

    // -- Parsing tests --

    #[test]
    fn test_parse_cpu_plain() {
        for (input, expected) in [("1", 1.0), ("0.5", 0.5), ("2.0", 2.0), ("16", 16.0)] {
            assert_eq!(parse_cpu(input).unwrap(), expected);
        }
    }

    #[test]
    fn test_parse_cpu_millicore() {
        for (input, expected) in [("500m", 0.5), ("250m", 0.25), ("1000m", 1.0), ("100m", 0.1)] {
            assert_eq!(parse_cpu(input).unwrap(), expected);
        }
    }

    #[test]
    fn test_parse_cpu_invalid() {
        for input in ["", "abc", "m"] {
            assert!(parse_cpu(input).is_err());
        }
    }

    #[test]
    fn test_parse_memory_binary_suffixes() {
        let cases = [
            ("1Ki", 1024_u64),
            ("1Mi", 1024_u64.pow(2)),
            ("1Gi", 1024_u64.pow(3)),
            ("4Gi", 4 * 1024_u64.pow(3)),
            ("512Mi", 512 * 1024_u64.pow(2)),
            ("1Ti", 1024_u64.pow(4)),
        ];
        for (input, expected) in cases {
            assert_eq!(parse_memory_bytes(input).unwrap(), expected);
        }
    }

    #[test]
    fn test_parse_memory_decimal_suffixes() {
        let cases = [
            ("1k", 1000_u64),
            ("1M", 1_000_000),
            ("1G", 1_000_000_000),
            ("1T", 1_000_000_000_000),
        ];
        for (input, expected) in cases {
            assert_eq!(parse_memory_bytes(input).unwrap(), expected);
        }
    }

    #[test]
    fn test_parse_memory_plain_bytes() {
        for (input, expected) in [("1024", 1024_u64), ("0", 0)] {
            assert_eq!(parse_memory_bytes(input).unwrap(), expected);
        }
    }

    #[test]
    fn test_parse_memory_invalid() {
        for input in ["", "abc", "Gi"] {
            assert!(parse_memory_bytes(input).is_err());
        }
    }

    #[test]
    fn test_parse_memory_fractional() {
        for (input, expected) in [("0.5Gi", GI / 2), ("1.5Gi", GI + GI / 2)] {
            assert_eq!(parse_memory_bytes(input).unwrap(), expected);
        }
    }

    // -- Catalog lookup tests --

    #[test]
    fn test_catalog_has_entries_for_all_cloud_platforms() {
        for platform in [Platform::Aws, Platform::Gcp, Platform::Azure] {
            assert!(!catalog_for_platform(platform).is_empty());
        }
    }

    #[test]
    fn test_catalog_no_entries_for_non_cloud_platforms() {
        for platform in [Platform::Local, Platform::Kubernetes] {
            assert!(catalog_for_platform(platform).is_empty());
        }
    }

    #[test]
    fn test_find_known_instance_type() {
        let spec =
            find_instance_type(Platform::Aws, "m7g.2xlarge").expect("should find m7g.2xlarge");
        assert_eq!(spec.vcpu, 8);
        assert_eq!(spec.memory_bytes, 32 * GI);
        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
    }

    #[test]
    fn test_find_unknown_instance_type() {
        let lookup = find_instance_type(Platform::Aws, "nonexistent.xlarge");
        assert!(lookup.is_none());
    }

    #[test]
    fn test_find_wrong_platform() {
        // The name exists in the catalog, but only for AWS.
        let lookup = find_instance_type(Platform::Gcp, "m7g.2xlarge");
        assert!(lookup.is_none());
    }

    #[test]
    fn test_to_machine_profile() {
        let profile = find_instance_type(Platform::Aws, "m7g.2xlarge")
            .unwrap()
            .to_machine_profile();
        assert_eq!(profile.cpu, "8.0");
        assert_eq!(profile.memory_bytes, 32 * GI);
        assert_eq!(profile.ephemeral_storage_bytes, 20 * GI);
        assert!(profile.gpu.is_none());
    }

    #[test]
    fn test_to_machine_profile_with_gpu() {
        let profile = find_instance_type(Platform::Aws, "p4d.24xlarge")
            .unwrap()
            .to_machine_profile();
        let gpu = profile.gpu.as_ref().expect("should have GPU");
        assert_eq!(gpu.gpu_type, "nvidia-a100");
        assert_eq!(gpu.count, 8);
    }

    // -- Selection algorithm tests --

    #[test]
    fn test_select_burstable_for_small_workload() {
        let req = workload(1.0, 2, 0.5, 1, 10, None);
        let spec = selected_spec(Platform::Aws, &req);
        assert_eq!(spec.family, InstanceFamily::Burstable);
    }

    #[test]
    fn test_select_general_purpose_for_standard_workload() {
        // Standard workloads always get GeneralPurpose regardless of CPU:memory ratio.
        let req = workload(20.0, 80, 2.0, 8, 10, None);
        let spec = selected_spec(Platform::Aws, &req);
        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
    }

    #[test]
    fn test_select_general_purpose_even_for_cpu_heavy() {
        // CPU-heavy workloads still get GeneralPurpose (no ComputeOptimized auto-select).
        let req = workload(20.0, 20, 2.0, 2, 10, None);
        let spec = selected_spec(Platform::Aws, &req);
        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
    }

    #[test]
    fn test_select_storage_optimized_for_large_ephemeral() {
        let req = workload(8.0, 32, 2.0, 8, 500, None);
        let spec = selected_spec(Platform::Aws, &req);
        assert_eq!(spec.family, InstanceFamily::StorageOptimized);
    }

    #[test]
    fn test_select_gpu_instance() {
        let a100 = GpuSpec {
            gpu_type: String::from("nvidia-a100"),
            count: 1,
        };
        let req = workload(8.0, 32, 4.0, 16, 10, Some(a100));
        let spec = selected_spec(Platform::Aws, &req);
        assert_eq!(spec.family, InstanceFamily::GpuCompute);
        assert!(spec.gpu.is_some());
    }

    #[test]
    fn test_select_works_for_all_cloud_platforms() {
        let req = workload(4.0, 16, 1.0, 4, 10, None);
        for platform in [Platform::Aws, Platform::Gcp, Platform::Azure] {
            let sel = select_instance_type(platform, &req);
            assert!(sel.is_ok(), "selection failed for {platform}");
        }
    }

    #[test]
    fn test_machine_count_reasonable() {
        // Single container: 1 CPU, 2Gi, maxReplicas=20
        let req = workload(20.0, 40, 1.0, 2, 10, None);
        let sel = select_instance_type(Platform::Aws, &req).unwrap();
        assert!(sel.min_machines >= 1);
        assert!(sel.max_machines <= MAX_MACHINES_PER_CLUSTER);
        assert!(sel.max_machines >= sel.min_machines);
    }

    #[test]
    fn test_instance_size_capped_at_8_vcpu() {
        // Even with very large containers, instance size is capped at 8 vCPUs.
        let req = workload(70.0, 140, 2.0, 4, 10, None);
        let sel = select_instance_type(Platform::Gcp, &req).unwrap();
        let spec = find_instance_type(Platform::Gcp, sel.instance_type).unwrap();
        assert!(
            spec.vcpu <= MAX_STANDARD_VCPU,
            "selected {} with {} vCPUs, expected <= {}",
            spec.name,
            spec.vcpu,
            MAX_STANDARD_VCPU
        );
        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
        // Capacity should come from additional machines instead.
        assert!(sel.max_machines > 1);
    }

    #[test]
    fn test_agent_manager_stack_gets_reasonable_instance() {
        // Simulates the agent-manager stack: 4 containers, each 2 CPU / 4 GiB,
        // with maxReplicas 10, 10, 10, 5:
        //   total CPU    = 2*10 + 2*10 + 2*10 + 2*5 = 70
        //   total memory = 4*10 + 4*10 + 4*10 + 4*5 = 140 GiB
        let req = workload(70.0, 140, 2.0, 4, 20, None);
        let sel = select_instance_type(Platform::Gcp, &req).unwrap();
        // Should pick n2-standard-8 (8 vCPU, 32 GiB) — NOT c3-standard-44
        assert_eq!(sel.instance_type, "n2-standard-8");
        assert!(sel.max_machines >= 2);
    }

    #[test]
    fn test_profile_has_required_fields() {
        let req = workload(4.0, 16, 1.0, 4, 10, None);
        let profile = select_instance_type(Platform::Aws, &req).unwrap().profile;
        assert!(!profile.cpu.is_empty());
        assert!(profile.memory_bytes > 0);
        assert!(profile.ephemeral_storage_bytes > 0);
    }

    #[test]
    fn test_error_for_unsupported_gpu_type() {
        let unsupported = GpuSpec {
            gpu_type: String::from("amd-mi300"),
            count: 1,
        };
        let req = workload(8.0, 32, 4.0, 16, 10, Some(unsupported));
        assert!(select_instance_type(Platform::Aws, &req).is_err());
    }

    #[test]
    fn test_catalog_instance_types_sorted_by_vcpu_within_family() {
        use std::collections::HashMap;

        // Within each (platform, family) group, vcpu must be non-decreasing in
        // catalog order; the selection logic breaks vCPU ties by position.
        for platform in [Platform::Aws, Platform::Gcp, Platform::Azure] {
            let mut by_family: HashMap<String, Vec<&'static InstanceTypeSpec>> = HashMap::new();
            for entry in catalog_for_platform(platform) {
                let key = format!("{:?}", entry.family);
                by_family.entry(key).or_default().push(entry);
            }

            for (family, instances) in &by_family {
                for (prev, next) in instances.iter().zip(instances.iter().skip(1)) {
                    assert!(
                        prev.vcpu <= next.vcpu,
                        "catalog not sorted by vcpu for {platform}/{family}: {} ({}) > {} ({})",
                        prev.name,
                        prev.vcpu,
                        next.name,
                        next.vcpu
                    );
                }
            }
        }
    }
}
alien_core/instance_catalog.rs

alien_core/
instance_catalog.rs