alien_core/
instance_catalog.rs

//! Instance type catalog and selection algorithm for cloud compute infrastructure.
//!
//! This module provides:
//! - A static catalog of known instance types across AWS, GCP, and Azure
//! - Resource quantity parsing (CPU strings, Kubernetes-style memory/storage quantities)
//! - An algorithm to select the optimal instance type for a given workload
//!
//! The catalog is the single source of truth for instance type specifications.
//! It is used by the preflights system to automatically populate `CapacityGroup.instance_type`
//! and `CapacityGroup.profile` based on the containers in a stack.
11
12use crate::{GpuSpec, MachineProfile, Platform};
13
// ---------------------------------------------------------------------------
// Resource quantity parsing
// ---------------------------------------------------------------------------
17
/// Parse a CPU quantity string into a number of cores.
///
/// Accepts plain numbers ("1", "0.5", "2.0") and millicore suffixes ("500m" = 0.5).
/// Leading and trailing whitespace is ignored.
///
/// # Errors
///
/// Returns a human-readable message when the input is empty, is not a valid
/// number, or is negative or non-finite. The last check is needed because
/// `f64` parsing accepts strings such as "-1", "NaN" and "inf", none of which
/// is a usable CPU quantity.
pub fn parse_cpu(s: &str) -> Result<f64, String> {
    let s = s.trim();
    if s.is_empty() {
        return Err("empty CPU string".to_string());
    }

    let cores = match s.strip_suffix('m') {
        // Millicores: "500m" means 500 / 1000 cores.
        Some(millis) => {
            millis
                .parse::<f64>()
                .map_err(|_| format!("invalid CPU millicore value: '{s}'"))?
                / 1000.0
        }
        None => s
            .parse::<f64>()
            .map_err(|_| format!("invalid CPU value: '{s}'"))?,
    };

    if !cores.is_finite() || cores < 0.0 {
        return Err(format!(
            "CPU value must be a non-negative finite number: '{s}'"
        ));
    }
    Ok(cores)
}
36
/// Parse a memory or storage quantity string to bytes.
///
/// Supports Kubernetes-style binary suffixes (Ki, Mi, Gi, Ti) and
/// decimal suffixes (k, M, G, T). Plain numbers are interpreted as bytes.
/// Leading and trailing whitespace is ignored.
///
/// Suffixed values may be fractional ("1.5Gi"); the byte count is truncated
/// towards zero. A value without a suffix must be a whole number of bytes.
///
/// # Errors
///
/// Returns a human-readable message when the input is empty, is not a valid
/// number, is negative or non-finite, or does not fit in a `u64`.
pub fn parse_memory_bytes(s: &str) -> Result<u64, String> {
    // Recognised suffixes and their byte multipliers: binary units (powers of
    // 1024) followed by decimal units (powers of 1000).
    const UNITS: [(&str, f64); 8] = [
        ("Ti", 1024.0 * 1024.0 * 1024.0 * 1024.0),
        ("Gi", 1024.0 * 1024.0 * 1024.0),
        ("Mi", 1024.0 * 1024.0),
        ("Ki", 1024.0),
        ("T", 1_000_000_000_000.0),
        ("G", 1_000_000_000.0),
        ("M", 1_000_000.0),
        ("k", 1000.0),
    ];

    let s = s.trim();
    if s.is_empty() {
        return Err("empty memory/storage string".to_string());
    }

    for &(suffix, multiplier) in UNITS.iter() {
        if let Some(num) = s.strip_suffix(suffix) {
            let value: f64 = num
                .parse()
                .map_err(|_| format!("invalid memory value: '{s}'"))?;

            // A float-to-integer `as` cast saturates silently (negative and NaN
            // become 0, infinity becomes u64::MAX), so validate before casting.
            if !value.is_finite() || value < 0.0 {
                return Err(format!(
                    "memory value must be a non-negative finite number: '{s}'"
                ));
            }
            let bytes = value * multiplier;
            // `u64::MAX as f64` rounds up to 2^64, the first unrepresentable value.
            if bytes >= u64::MAX as f64 {
                return Err(format!("memory value out of range: '{s}'"));
            }
            return Ok(bytes as u64);
        }
    }

    // Plain bytes
    s.parse()
        .map_err(|_| format!("invalid memory value: '{s}'"))
}
103
// ---------------------------------------------------------------------------
// Instance type catalog
// ---------------------------------------------------------------------------
107
/// Instance family classification.
///
/// Every catalog entry is tagged with exactly one family. The examples on each
/// variant name the instance series that carry that tag in this file's catalog.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InstanceFamily {
    /// Burstable instances (catalog: AWS t4g, GCP e2-micro/small/medium, Azure B-series).
    Burstable,
    /// General-purpose instances (catalog: AWS m7g, GCP n2-standard, Azure Dv5).
    GeneralPurpose,
    /// Compute-optimized instances (catalog: AWS c7g, GCP c3-standard, Azure Fv2).
    ComputeOptimized,
    /// Memory-optimized instances (catalog: AWS r7g, GCP n2-highmem, Azure Ev5).
    MemoryOptimized,
    /// Storage-optimized instances with large local disks
    /// (catalog: AWS i4i, GCP c3d-standard, Azure Lsv3).
    StorageOptimized,
    /// Instances that carry GPUs; catalog entries in this family set `gpu` to `Some(..)`.
    GpuCompute,
}
118
/// CPU architecture of an instance type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Architecture {
    /// 64-bit ARM (in this catalog: the AWS Graviton families t4g, m7g, c7g, r7g).
    Arm64,
    /// 64-bit x86 (every other entry in this catalog).
    X86_64,
}
125
/// Static GPU specification for catalog entries (no heap allocation).
///
/// Uses a `&'static str` for the GPU type so the value can live inside a
/// `static` catalog array.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CatalogGpu {
    /// GPU model identifier, e.g. "nvidia-t4", "nvidia-a100", "nvidia-h100".
    pub gpu_type: &'static str,
    /// Number of GPUs attached to the instance.
    pub count: u32,
}
132
133/// A known instance type with its hardware specifications.
134///
135/// All fields are compile-time constants. The catalog is a flat array of these.
136#[derive(Debug, Clone)]
137pub struct InstanceTypeSpec {
138    pub name: &'static str,
139    pub platform: Platform,
140    pub family: InstanceFamily,
141    pub architecture: Architecture,
142    /// vCPU count (hardware total)
143    pub vcpu: u32,
144    /// Memory in bytes (hardware total)
145    pub memory_bytes: u64,
146    /// Ephemeral storage in bytes (hardware total, NVMe for storage-optimized)
147    pub ephemeral_storage_bytes: u64,
148    /// GPU specification (for GPU instances)
149    pub gpu: Option<CatalogGpu>,
150}
151
152impl InstanceTypeSpec {
153    /// Convert this catalog entry into a `MachineProfile` for use in `CapacityGroup`.
154    pub fn to_machine_profile(&self) -> MachineProfile {
155        MachineProfile {
156            cpu: format!("{}.0", self.vcpu),
157            memory_bytes: self.memory_bytes,
158            ephemeral_storage_bytes: self.ephemeral_storage_bytes,
159            gpu: self.gpu.map(|g| GpuSpec {
160                gpu_type: g.gpu_type.to_string(),
161                count: g.count,
162            }),
163        }
164    }
165}
166
// Helpers for readable byte constants: binary multipliers, so `16 * GI` is 16 GiB.
/// Bytes in one kibibyte.
const KI: u64 = 1024;
/// Bytes in one mebibyte.
const MI: u64 = KI * 1024;
/// Bytes in one gibibyte.
const GI: u64 = MI * 1024;
171
172/// The complete instance type catalog.
173///
174/// This is the single source of truth for instance type specifications.
175/// Update this array when adding support for new instance types.
176///
177/// NOTE: Ephemeral storage values for non-NVMe instances are conservative defaults
178/// (EBS-backed root volumes). Storage-optimized instances list their NVMe capacity.
179static CATALOG: &[InstanceTypeSpec] = &[
180    // =========================================================================
181    // AWS — ARM (Graviton) preferred for cost efficiency
182    // =========================================================================
183
184    // Burstable (t4g — ARM Graviton2)
185    InstanceTypeSpec {
186        name: "t4g.micro",
187        platform: Platform::Aws,
188        family: InstanceFamily::Burstable,
189        architecture: Architecture::Arm64,
190        vcpu: 2,
191        memory_bytes: 1 * GI,
192        ephemeral_storage_bytes: 20 * GI,
193        gpu: None,
194    },
195    InstanceTypeSpec {
196        name: "t4g.small",
197        platform: Platform::Aws,
198        family: InstanceFamily::Burstable,
199        architecture: Architecture::Arm64,
200        vcpu: 2,
201        memory_bytes: 2 * GI,
202        ephemeral_storage_bytes: 20 * GI,
203        gpu: None,
204    },
205    InstanceTypeSpec {
206        name: "t4g.medium",
207        platform: Platform::Aws,
208        family: InstanceFamily::Burstable,
209        architecture: Architecture::Arm64,
210        vcpu: 2,
211        memory_bytes: 4 * GI,
212        ephemeral_storage_bytes: 20 * GI,
213        gpu: None,
214    },
215    InstanceTypeSpec {
216        name: "t4g.large",
217        platform: Platform::Aws,
218        family: InstanceFamily::Burstable,
219        architecture: Architecture::Arm64,
220        vcpu: 2,
221        memory_bytes: 8 * GI,
222        ephemeral_storage_bytes: 20 * GI,
223        gpu: None,
224    },
225    InstanceTypeSpec {
226        name: "t4g.xlarge",
227        platform: Platform::Aws,
228        family: InstanceFamily::Burstable,
229        architecture: Architecture::Arm64,
230        vcpu: 4,
231        memory_bytes: 16 * GI,
232        ephemeral_storage_bytes: 20 * GI,
233        gpu: None,
234    },
235    // General Purpose (m7g — ARM Graviton3, up to 2xlarge / 8 vCPU)
236    InstanceTypeSpec {
237        name: "m7g.medium",
238        platform: Platform::Aws,
239        family: InstanceFamily::GeneralPurpose,
240        architecture: Architecture::Arm64,
241        vcpu: 1,
242        memory_bytes: 4 * GI,
243        ephemeral_storage_bytes: 20 * GI,
244        gpu: None,
245    },
246    InstanceTypeSpec {
247        name: "m7g.large",
248        platform: Platform::Aws,
249        family: InstanceFamily::GeneralPurpose,
250        architecture: Architecture::Arm64,
251        vcpu: 2,
252        memory_bytes: 8 * GI,
253        ephemeral_storage_bytes: 20 * GI,
254        gpu: None,
255    },
256    InstanceTypeSpec {
257        name: "m7g.xlarge",
258        platform: Platform::Aws,
259        family: InstanceFamily::GeneralPurpose,
260        architecture: Architecture::Arm64,
261        vcpu: 4,
262        memory_bytes: 16 * GI,
263        ephemeral_storage_bytes: 20 * GI,
264        gpu: None,
265    },
266    InstanceTypeSpec {
267        name: "m7g.2xlarge",
268        platform: Platform::Aws,
269        family: InstanceFamily::GeneralPurpose,
270        architecture: Architecture::Arm64,
271        vcpu: 8,
272        memory_bytes: 32 * GI,
273        ephemeral_storage_bytes: 20 * GI,
274        gpu: None,
275    },
276    InstanceTypeSpec {
277        name: "m7g.4xlarge",
278        platform: Platform::Aws,
279        family: InstanceFamily::GeneralPurpose,
280        architecture: Architecture::Arm64,
281        vcpu: 16,
282        memory_bytes: 64 * GI,
283        ephemeral_storage_bytes: 20 * GI,
284        gpu: None,
285    },
286    // Compute Optimized (c7g — ARM Graviton3, up to 2xlarge / 8 vCPU)
287    InstanceTypeSpec {
288        name: "c7g.medium",
289        platform: Platform::Aws,
290        family: InstanceFamily::ComputeOptimized,
291        architecture: Architecture::Arm64,
292        vcpu: 1,
293        memory_bytes: 2 * GI,
294        ephemeral_storage_bytes: 20 * GI,
295        gpu: None,
296    },
297    InstanceTypeSpec {
298        name: "c7g.large",
299        platform: Platform::Aws,
300        family: InstanceFamily::ComputeOptimized,
301        architecture: Architecture::Arm64,
302        vcpu: 2,
303        memory_bytes: 4 * GI,
304        ephemeral_storage_bytes: 20 * GI,
305        gpu: None,
306    },
307    InstanceTypeSpec {
308        name: "c7g.xlarge",
309        platform: Platform::Aws,
310        family: InstanceFamily::ComputeOptimized,
311        architecture: Architecture::Arm64,
312        vcpu: 4,
313        memory_bytes: 8 * GI,
314        ephemeral_storage_bytes: 20 * GI,
315        gpu: None,
316    },
317    InstanceTypeSpec {
318        name: "c7g.2xlarge",
319        platform: Platform::Aws,
320        family: InstanceFamily::ComputeOptimized,
321        architecture: Architecture::Arm64,
322        vcpu: 8,
323        memory_bytes: 16 * GI,
324        ephemeral_storage_bytes: 20 * GI,
325        gpu: None,
326    },
327    InstanceTypeSpec {
328        name: "c7g.4xlarge",
329        platform: Platform::Aws,
330        family: InstanceFamily::ComputeOptimized,
331        architecture: Architecture::Arm64,
332        vcpu: 16,
333        memory_bytes: 32 * GI,
334        ephemeral_storage_bytes: 20 * GI,
335        gpu: None,
336    },
337    // Memory Optimized (r7g — ARM Graviton3, up to 2xlarge / 8 vCPU)
338    InstanceTypeSpec {
339        name: "r7g.medium",
340        platform: Platform::Aws,
341        family: InstanceFamily::MemoryOptimized,
342        architecture: Architecture::Arm64,
343        vcpu: 1,
344        memory_bytes: 8 * GI,
345        ephemeral_storage_bytes: 20 * GI,
346        gpu: None,
347    },
348    InstanceTypeSpec {
349        name: "r7g.large",
350        platform: Platform::Aws,
351        family: InstanceFamily::MemoryOptimized,
352        architecture: Architecture::Arm64,
353        vcpu: 2,
354        memory_bytes: 16 * GI,
355        ephemeral_storage_bytes: 20 * GI,
356        gpu: None,
357    },
358    InstanceTypeSpec {
359        name: "r7g.xlarge",
360        platform: Platform::Aws,
361        family: InstanceFamily::MemoryOptimized,
362        architecture: Architecture::Arm64,
363        vcpu: 4,
364        memory_bytes: 32 * GI,
365        ephemeral_storage_bytes: 20 * GI,
366        gpu: None,
367    },
368    InstanceTypeSpec {
369        name: "r7g.2xlarge",
370        platform: Platform::Aws,
371        family: InstanceFamily::MemoryOptimized,
372        architecture: Architecture::Arm64,
373        vcpu: 8,
374        memory_bytes: 64 * GI,
375        ephemeral_storage_bytes: 20 * GI,
376        gpu: None,
377    },
378    InstanceTypeSpec {
379        name: "r7g.4xlarge",
380        platform: Platform::Aws,
381        family: InstanceFamily::MemoryOptimized,
382        architecture: Architecture::Arm64,
383        vcpu: 16,
384        memory_bytes: 128 * GI,
385        ephemeral_storage_bytes: 20 * GI,
386        gpu: None,
387    },
388    // Storage Optimized (i4i — x86_64, NVMe)
389    InstanceTypeSpec {
390        name: "i4i.xlarge",
391        platform: Platform::Aws,
392        family: InstanceFamily::StorageOptimized,
393        architecture: Architecture::X86_64,
394        vcpu: 4,
395        memory_bytes: 32 * GI,
396        ephemeral_storage_bytes: 937 * GI,
397        gpu: None,
398    },
399    InstanceTypeSpec {
400        name: "i4i.2xlarge",
401        platform: Platform::Aws,
402        family: InstanceFamily::StorageOptimized,
403        architecture: Architecture::X86_64,
404        vcpu: 8,
405        memory_bytes: 64 * GI,
406        ephemeral_storage_bytes: 1875 * GI,
407        gpu: None,
408    },
409    InstanceTypeSpec {
410        name: "i4i.4xlarge",
411        platform: Platform::Aws,
412        family: InstanceFamily::StorageOptimized,
413        architecture: Architecture::X86_64,
414        vcpu: 16,
415        memory_bytes: 128 * GI,
416        ephemeral_storage_bytes: 3750 * GI,
417        gpu: None,
418    },
419    InstanceTypeSpec {
420        name: "i4i.8xlarge",
421        platform: Platform::Aws,
422        family: InstanceFamily::StorageOptimized,
423        architecture: Architecture::X86_64,
424        vcpu: 32,
425        memory_bytes: 256 * GI,
426        ephemeral_storage_bytes: 7500 * GI,
427        gpu: None,
428    },
429    // GPU — NVIDIA T4 (g5 — x86_64)
430    InstanceTypeSpec {
431        name: "g5.xlarge",
432        platform: Platform::Aws,
433        family: InstanceFamily::GpuCompute,
434        architecture: Architecture::X86_64,
435        vcpu: 4,
436        memory_bytes: 16 * GI,
437        ephemeral_storage_bytes: 250 * GI,
438        gpu: Some(CatalogGpu {
439            gpu_type: "nvidia-t4",
440            count: 1,
441        }),
442    },
443    InstanceTypeSpec {
444        name: "g5.2xlarge",
445        platform: Platform::Aws,
446        family: InstanceFamily::GpuCompute,
447        architecture: Architecture::X86_64,
448        vcpu: 8,
449        memory_bytes: 32 * GI,
450        ephemeral_storage_bytes: 450 * GI,
451        gpu: Some(CatalogGpu {
452            gpu_type: "nvidia-t4",
453            count: 1,
454        }),
455    },
456    // GPU — NVIDIA A100 (p4d — x86_64)
457    InstanceTypeSpec {
458        name: "p4d.24xlarge",
459        platform: Platform::Aws,
460        family: InstanceFamily::GpuCompute,
461        architecture: Architecture::X86_64,
462        vcpu: 96,
463        memory_bytes: 1152 * GI,
464        ephemeral_storage_bytes: 8000 * GI,
465        gpu: Some(CatalogGpu {
466            gpu_type: "nvidia-a100",
467            count: 8,
468        }),
469    },
470    // GPU — NVIDIA H100 (p5 — x86_64)
471    InstanceTypeSpec {
472        name: "p5.48xlarge",
473        platform: Platform::Aws,
474        family: InstanceFamily::GpuCompute,
475        architecture: Architecture::X86_64,
476        vcpu: 192,
477        memory_bytes: 2048 * GI,
478        ephemeral_storage_bytes: 8000 * GI,
479        gpu: Some(CatalogGpu {
480            gpu_type: "nvidia-h100",
481            count: 8,
482        }),
483    },
484    // =========================================================================
485    // GCP
486    // =========================================================================
487
488    // Burstable (e2)
489    InstanceTypeSpec {
490        name: "e2-micro",
491        platform: Platform::Gcp,
492        family: InstanceFamily::Burstable,
493        architecture: Architecture::X86_64,
494        vcpu: 2,
495        memory_bytes: 1 * GI,
496        ephemeral_storage_bytes: 20 * GI,
497        gpu: None,
498    },
499    InstanceTypeSpec {
500        name: "e2-small",
501        platform: Platform::Gcp,
502        family: InstanceFamily::Burstable,
503        architecture: Architecture::X86_64,
504        vcpu: 2,
505        memory_bytes: 2 * GI,
506        ephemeral_storage_bytes: 20 * GI,
507        gpu: None,
508    },
509    InstanceTypeSpec {
510        name: "e2-medium",
511        platform: Platform::Gcp,
512        family: InstanceFamily::Burstable,
513        architecture: Architecture::X86_64,
514        vcpu: 2,
515        memory_bytes: 4 * GI,
516        ephemeral_storage_bytes: 20 * GI,
517        gpu: None,
518    },
519    // General Purpose (n2-standard, up to 16 vCPU)
520    InstanceTypeSpec {
521        name: "n2-standard-2",
522        platform: Platform::Gcp,
523        family: InstanceFamily::GeneralPurpose,
524        architecture: Architecture::X86_64,
525        vcpu: 2,
526        memory_bytes: 8 * GI,
527        ephemeral_storage_bytes: 20 * GI,
528        gpu: None,
529    },
530    InstanceTypeSpec {
531        name: "n2-standard-4",
532        platform: Platform::Gcp,
533        family: InstanceFamily::GeneralPurpose,
534        architecture: Architecture::X86_64,
535        vcpu: 4,
536        memory_bytes: 16 * GI,
537        ephemeral_storage_bytes: 20 * GI,
538        gpu: None,
539    },
540    InstanceTypeSpec {
541        name: "n2-standard-8",
542        platform: Platform::Gcp,
543        family: InstanceFamily::GeneralPurpose,
544        architecture: Architecture::X86_64,
545        vcpu: 8,
546        memory_bytes: 32 * GI,
547        ephemeral_storage_bytes: 20 * GI,
548        gpu: None,
549    },
550    InstanceTypeSpec {
551        name: "n2-standard-16",
552        platform: Platform::Gcp,
553        family: InstanceFamily::GeneralPurpose,
554        architecture: Architecture::X86_64,
555        vcpu: 16,
556        memory_bytes: 64 * GI,
557        ephemeral_storage_bytes: 20 * GI,
558        gpu: None,
559    },
560    // Compute Optimized (c3-standard, up to 8 vCPU)
561    InstanceTypeSpec {
562        name: "c3-standard-4",
563        platform: Platform::Gcp,
564        family: InstanceFamily::ComputeOptimized,
565        architecture: Architecture::X86_64,
566        vcpu: 4,
567        memory_bytes: 8 * GI,
568        ephemeral_storage_bytes: 20 * GI,
569        gpu: None,
570    },
571    InstanceTypeSpec {
572        name: "c3-standard-8",
573        platform: Platform::Gcp,
574        family: InstanceFamily::ComputeOptimized,
575        architecture: Architecture::X86_64,
576        vcpu: 8,
577        memory_bytes: 16 * GI,
578        ephemeral_storage_bytes: 20 * GI,
579        gpu: None,
580    },
581    // Memory Optimized (n2-highmem, up to 8 vCPU)
582    InstanceTypeSpec {
583        name: "n2-highmem-2",
584        platform: Platform::Gcp,
585        family: InstanceFamily::MemoryOptimized,
586        architecture: Architecture::X86_64,
587        vcpu: 2,
588        memory_bytes: 16 * GI,
589        ephemeral_storage_bytes: 20 * GI,
590        gpu: None,
591    },
592    InstanceTypeSpec {
593        name: "n2-highmem-4",
594        platform: Platform::Gcp,
595        family: InstanceFamily::MemoryOptimized,
596        architecture: Architecture::X86_64,
597        vcpu: 4,
598        memory_bytes: 32 * GI,
599        ephemeral_storage_bytes: 20 * GI,
600        gpu: None,
601    },
602    InstanceTypeSpec {
603        name: "n2-highmem-8",
604        platform: Platform::Gcp,
605        family: InstanceFamily::MemoryOptimized,
606        architecture: Architecture::X86_64,
607        vcpu: 8,
608        memory_bytes: 64 * GI,
609        ephemeral_storage_bytes: 20 * GI,
610        gpu: None,
611    },
612    InstanceTypeSpec {
613        name: "n2-highmem-16",
614        platform: Platform::Gcp,
615        family: InstanceFamily::MemoryOptimized,
616        architecture: Architecture::X86_64,
617        vcpu: 16,
618        memory_bytes: 128 * GI,
619        ephemeral_storage_bytes: 20 * GI,
620        gpu: None,
621    },
622    InstanceTypeSpec {
623        name: "n2-highmem-32",
624        platform: Platform::Gcp,
625        family: InstanceFamily::MemoryOptimized,
626        architecture: Architecture::X86_64,
627        vcpu: 32,
628        memory_bytes: 256 * GI,
629        ephemeral_storage_bytes: 20 * GI,
630        gpu: None,
631    },
632    // Storage Optimized (c3d-standard with local SSD)
633    InstanceTypeSpec {
634        name: "c3d-standard-8",
635        platform: Platform::Gcp,
636        family: InstanceFamily::StorageOptimized,
637        architecture: Architecture::X86_64,
638        vcpu: 8,
639        memory_bytes: 32 * GI,
640        ephemeral_storage_bytes: 480 * GI,
641        gpu: None,
642    },
643    InstanceTypeSpec {
644        name: "c3d-standard-16",
645        platform: Platform::Gcp,
646        family: InstanceFamily::StorageOptimized,
647        architecture: Architecture::X86_64,
648        vcpu: 16,
649        memory_bytes: 64 * GI,
650        ephemeral_storage_bytes: 960 * GI,
651        gpu: None,
652    },
653    InstanceTypeSpec {
654        name: "c3d-standard-30",
655        platform: Platform::Gcp,
656        family: InstanceFamily::StorageOptimized,
657        architecture: Architecture::X86_64,
658        vcpu: 30,
659        memory_bytes: 120 * GI,
660        ephemeral_storage_bytes: 1920 * GI,
661        gpu: None,
662    },
663    // GPU — NVIDIA T4 (n1-standard + T4)
664    InstanceTypeSpec {
665        name: "n1-standard-4-t4",
666        platform: Platform::Gcp,
667        family: InstanceFamily::GpuCompute,
668        architecture: Architecture::X86_64,
669        vcpu: 4,
670        memory_bytes: 15 * GI,
671        ephemeral_storage_bytes: 100 * GI,
672        gpu: Some(CatalogGpu {
673            gpu_type: "nvidia-t4",
674            count: 1,
675        }),
676    },
677    // GPU — NVIDIA A100 (a2-highgpu)
678    InstanceTypeSpec {
679        name: "a2-highgpu-1g",
680        platform: Platform::Gcp,
681        family: InstanceFamily::GpuCompute,
682        architecture: Architecture::X86_64,
683        vcpu: 12,
684        memory_bytes: 85 * GI,
685        ephemeral_storage_bytes: 100 * GI,
686        gpu: Some(CatalogGpu {
687            gpu_type: "nvidia-a100",
688            count: 1,
689        }),
690    },
691    InstanceTypeSpec {
692        name: "a2-highgpu-8g",
693        platform: Platform::Gcp,
694        family: InstanceFamily::GpuCompute,
695        architecture: Architecture::X86_64,
696        vcpu: 96,
697        memory_bytes: 1360 * GI,
698        ephemeral_storage_bytes: 100 * GI,
699        gpu: Some(CatalogGpu {
700            gpu_type: "nvidia-a100",
701            count: 8,
702        }),
703    },
704    // GPU — NVIDIA H100 (a3-highgpu)
705    InstanceTypeSpec {
706        name: "a3-highgpu-8g",
707        platform: Platform::Gcp,
708        family: InstanceFamily::GpuCompute,
709        architecture: Architecture::X86_64,
710        vcpu: 208,
711        memory_bytes: 1872 * GI,
712        ephemeral_storage_bytes: 100 * GI,
713        gpu: Some(CatalogGpu {
714            gpu_type: "nvidia-h100",
715            count: 8,
716        }),
717    },
718    // =========================================================================
719    // Azure
720    // =========================================================================
721
722    // Burstable (B-series v2)
723    InstanceTypeSpec {
724        name: "Standard_B1s",
725        platform: Platform::Azure,
726        family: InstanceFamily::Burstable,
727        architecture: Architecture::X86_64,
728        vcpu: 1,
729        memory_bytes: 1 * GI,
730        ephemeral_storage_bytes: 20 * GI,
731        gpu: None,
732    },
733    InstanceTypeSpec {
734        name: "Standard_B2s",
735        platform: Platform::Azure,
736        family: InstanceFamily::Burstable,
737        architecture: Architecture::X86_64,
738        vcpu: 2,
739        memory_bytes: 4 * GI,
740        ephemeral_storage_bytes: 20 * GI,
741        gpu: None,
742    },
743    InstanceTypeSpec {
744        name: "Standard_B2ms",
745        platform: Platform::Azure,
746        family: InstanceFamily::Burstable,
747        architecture: Architecture::X86_64,
748        vcpu: 2,
749        memory_bytes: 8 * GI,
750        ephemeral_storage_bytes: 20 * GI,
751        gpu: None,
752    },
753    InstanceTypeSpec {
754        name: "Standard_B4ms",
755        platform: Platform::Azure,
756        family: InstanceFamily::Burstable,
757        architecture: Architecture::X86_64,
758        vcpu: 4,
759        memory_bytes: 16 * GI,
760        ephemeral_storage_bytes: 20 * GI,
761        gpu: None,
762    },
763    // General Purpose (Dv5-series, up to 16 vCPU)
764    InstanceTypeSpec {
765        name: "Standard_D2s_v5",
766        platform: Platform::Azure,
767        family: InstanceFamily::GeneralPurpose,
768        architecture: Architecture::X86_64,
769        vcpu: 2,
770        memory_bytes: 8 * GI,
771        ephemeral_storage_bytes: 20 * GI,
772        gpu: None,
773    },
774    InstanceTypeSpec {
775        name: "Standard_D4s_v5",
776        platform: Platform::Azure,
777        family: InstanceFamily::GeneralPurpose,
778        architecture: Architecture::X86_64,
779        vcpu: 4,
780        memory_bytes: 16 * GI,
781        ephemeral_storage_bytes: 20 * GI,
782        gpu: None,
783    },
784    InstanceTypeSpec {
785        name: "Standard_D8s_v5",
786        platform: Platform::Azure,
787        family: InstanceFamily::GeneralPurpose,
788        architecture: Architecture::X86_64,
789        vcpu: 8,
790        memory_bytes: 32 * GI,
791        ephemeral_storage_bytes: 20 * GI,
792        gpu: None,
793    },
794    InstanceTypeSpec {
795        name: "Standard_D16s_v5",
796        platform: Platform::Azure,
797        family: InstanceFamily::GeneralPurpose,
798        architecture: Architecture::X86_64,
799        vcpu: 16,
800        memory_bytes: 64 * GI,
801        ephemeral_storage_bytes: 20 * GI,
802        gpu: None,
803    },
804    // Compute Optimized (Fv2-series, up to 16 vCPU)
805    InstanceTypeSpec {
806        name: "Standard_F2s_v2",
807        platform: Platform::Azure,
808        family: InstanceFamily::ComputeOptimized,
809        architecture: Architecture::X86_64,
810        vcpu: 2,
811        memory_bytes: 4 * GI,
812        ephemeral_storage_bytes: 20 * GI,
813        gpu: None,
814    },
815    InstanceTypeSpec {
816        name: "Standard_F4s_v2",
817        platform: Platform::Azure,
818        family: InstanceFamily::ComputeOptimized,
819        architecture: Architecture::X86_64,
820        vcpu: 4,
821        memory_bytes: 8 * GI,
822        ephemeral_storage_bytes: 20 * GI,
823        gpu: None,
824    },
825    InstanceTypeSpec {
826        name: "Standard_F8s_v2",
827        platform: Platform::Azure,
828        family: InstanceFamily::ComputeOptimized,
829        architecture: Architecture::X86_64,
830        vcpu: 8,
831        memory_bytes: 16 * GI,
832        ephemeral_storage_bytes: 20 * GI,
833        gpu: None,
834    },
835    InstanceTypeSpec {
836        name: "Standard_F16s_v2",
837        platform: Platform::Azure,
838        family: InstanceFamily::ComputeOptimized,
839        architecture: Architecture::X86_64,
840        vcpu: 16,
841        memory_bytes: 32 * GI,
842        ephemeral_storage_bytes: 20 * GI,
843        gpu: None,
844    },
845    // Memory Optimized (Ev5-series, up to 16 vCPU)
846    InstanceTypeSpec {
847        name: "Standard_E2s_v5",
848        platform: Platform::Azure,
849        family: InstanceFamily::MemoryOptimized,
850        architecture: Architecture::X86_64,
851        vcpu: 2,
852        memory_bytes: 16 * GI,
853        ephemeral_storage_bytes: 20 * GI,
854        gpu: None,
855    },
856    InstanceTypeSpec {
857        name: "Standard_E4s_v5",
858        platform: Platform::Azure,
859        family: InstanceFamily::MemoryOptimized,
860        architecture: Architecture::X86_64,
861        vcpu: 4,
862        memory_bytes: 32 * GI,
863        ephemeral_storage_bytes: 20 * GI,
864        gpu: None,
865    },
866    InstanceTypeSpec {
867        name: "Standard_E8s_v5",
868        platform: Platform::Azure,
869        family: InstanceFamily::MemoryOptimized,
870        architecture: Architecture::X86_64,
871        vcpu: 8,
872        memory_bytes: 64 * GI,
873        ephemeral_storage_bytes: 20 * GI,
874        gpu: None,
875    },
876    InstanceTypeSpec {
877        name: "Standard_E16s_v5",
878        platform: Platform::Azure,
879        family: InstanceFamily::MemoryOptimized,
880        architecture: Architecture::X86_64,
881        vcpu: 16,
882        memory_bytes: 128 * GI,
883        ephemeral_storage_bytes: 20 * GI,
884        gpu: None,
885    },
886    // Storage Optimized (Lsv3-series with NVMe)
887    InstanceTypeSpec {
888        name: "Standard_L8s_v3",
889        platform: Platform::Azure,
890        family: InstanceFamily::StorageOptimized,
891        architecture: Architecture::X86_64,
892        vcpu: 8,
893        memory_bytes: 64 * GI,
894        ephemeral_storage_bytes: 1788 * GI,
895        gpu: None,
896    },
897    InstanceTypeSpec {
898        name: "Standard_L16s_v3",
899        platform: Platform::Azure,
900        family: InstanceFamily::StorageOptimized,
901        architecture: Architecture::X86_64,
902        vcpu: 16,
903        memory_bytes: 128 * GI,
904        ephemeral_storage_bytes: 3576 * GI,
905        gpu: None,
906    },
907    InstanceTypeSpec {
908        name: "Standard_L32s_v3",
909        platform: Platform::Azure,
910        family: InstanceFamily::StorageOptimized,
911        architecture: Architecture::X86_64,
912        vcpu: 32,
913        memory_bytes: 256 * GI,
914        ephemeral_storage_bytes: 7154 * GI,
915        gpu: None,
916    },
917    // GPU — NVIDIA T4 (NCasT4_v3-series)
918    InstanceTypeSpec {
919        name: "Standard_NC4as_T4_v3",
920        platform: Platform::Azure,
921        family: InstanceFamily::GpuCompute,
922        architecture: Architecture::X86_64,
923        vcpu: 4,
924        memory_bytes: 28 * GI,
925        ephemeral_storage_bytes: 176 * GI,
926        gpu: Some(CatalogGpu {
927            gpu_type: "nvidia-t4",
928            count: 1,
929        }),
930    },
931    // GPU — NVIDIA A100 (NC A100 v4-series)
932    InstanceTypeSpec {
933        name: "Standard_NC24ads_A100_v4",
934        platform: Platform::Azure,
935        family: InstanceFamily::GpuCompute,
936        architecture: Architecture::X86_64,
937        vcpu: 24,
938        memory_bytes: 220 * GI,
939        ephemeral_storage_bytes: 958 * GI,
940        gpu: Some(CatalogGpu {
941            gpu_type: "nvidia-a100",
942            count: 1,
943        }),
944    },
945    InstanceTypeSpec {
946        name: "Standard_NC96ads_A100_v4",
947        platform: Platform::Azure,
948        family: InstanceFamily::GpuCompute,
949        architecture: Architecture::X86_64,
950        vcpu: 96,
951        memory_bytes: 880 * GI,
952        ephemeral_storage_bytes: 3916 * GI,
953        gpu: Some(CatalogGpu {
954            gpu_type: "nvidia-a100",
955            count: 4,
956        }),
957    },
958    // GPU — NVIDIA H100 (ND H100 v5-series)
959    InstanceTypeSpec {
960        name: "Standard_ND96isr_H100_v5",
961        platform: Platform::Azure,
962        family: InstanceFamily::GpuCompute,
963        architecture: Architecture::X86_64,
964        vcpu: 96,
965        memory_bytes: 1900 * GI,
966        ephemeral_storage_bytes: 1000 * GI,
967        gpu: Some(CatalogGpu {
968            gpu_type: "nvidia-h100",
969            count: 8,
970        }),
971    },
972];
973
974// ---------------------------------------------------------------------------
975// Catalog lookup
976// ---------------------------------------------------------------------------
977
978/// Get all instance types for a given platform.
979pub fn catalog_for_platform(platform: Platform) -> Vec<&'static InstanceTypeSpec> {
980    CATALOG
981        .iter()
982        .filter(|spec| spec.platform == platform)
983        .collect()
984}
985
986/// Find a specific instance type by name and platform.
987pub fn find_instance_type(platform: Platform, name: &str) -> Option<&'static InstanceTypeSpec> {
988    CATALOG
989        .iter()
990        .find(|spec| spec.platform == platform && spec.name == name)
991}
992
993// ---------------------------------------------------------------------------
994// Instance type selection
995// ---------------------------------------------------------------------------
996
997/// Aggregated resource requirements from all containers in a capacity group.
998#[derive(Debug, Clone)]
999pub struct WorkloadRequirements {
1000    /// Total CPU needed at desired scale (sum of desired CPU * desired_replicas per container)
1001    pub total_cpu_at_desired: f64,
1002    /// Total memory needed at desired scale (sum of desired memory * desired_replicas per container)
1003    pub total_memory_bytes_at_desired: u64,
1004    /// Total CPU needed at maximum scale (sum of desired CPU * max_replicas per container)
1005    pub total_cpu_at_max: f64,
1006    /// Total memory needed at maximum scale (sum of desired memory * max_replicas per container)
1007    pub total_memory_bytes_at_max: u64,
1008    /// Largest CPU request among all individual containers (single replica)
1009    pub max_cpu_per_container: f64,
1010    /// Largest memory request among all individual containers (single replica)
1011    pub max_memory_per_container: u64,
1012    /// Maximum ephemeral storage any single container requires
1013    pub max_ephemeral_storage_bytes: u64,
1014    /// GPU requirement (if any container needs GPU)
1015    pub gpu: Option<GpuSpec>,
1016}
1017
1018/// Result of instance type selection.
1019#[derive(Debug, Clone)]
1020pub struct InstanceSelection {
1021    /// Selected instance type name (e.g., "m7g.2xlarge")
1022    pub instance_type: &'static str,
1023    /// Machine profile derived from the instance type
1024    pub profile: MachineProfile,
1025    /// Recommended minimum number of machines
1026    pub min_machines: u32,
1027    /// Recommended maximum number of machines
1028    pub max_machines: u32,
1029}
1030
1031/// Ephemeral storage threshold above which storage-optimized instances are selected.
1032const STORAGE_OPTIMIZED_THRESHOLD: u64 = 200 * GI;
1033
1034/// Maximum number of machines per cluster.
1035const MAX_MACHINES_PER_CLUSTER: u32 = 10;
1036
1037/// Hard cap on vCPUs for non-GPU/non-storage workloads. Equivalent to AWS 2xlarge.
1038/// Beyond this, horizontal scaling is always preferred over bigger machines.
1039const MAX_STANDARD_VCPU: u32 = 8;
1040
1041/// How many of the largest standard container we want to fit per machine.
1042const STANDARD_CONTAINERS_PER_MACHINE: f64 = 2.0;
1043
1044/// Overhead factor for system processes and bin-packing inefficiency.
1045const OVERHEAD_FACTOR: f64 = 1.25;
1046
1047/// Runtime CPU reserved for system processes on each managed container machine.
1048const SYSTEM_RESERVE_CPU: f64 = 0.5;
1049
1050/// Runtime planning headroom for total desired/max workload.
1051const WORKLOAD_HEADROOM_FACTOR: f64 = 1.15;
1052
1053/// Select the best instance type for a workload on a given platform.
1054///
1055/// The algorithm:
1056/// 1. GPU workloads: Match by GPU type, find smallest instance with enough GPUs.
1057/// 2. Storage-heavy workloads (>200Gi ephemeral): Use storage-optimized instances.
1058/// 3. All other workloads: Size the machine to fit a small HA-friendly baseline,
1059///    capped at 8 vCPUs. Use GeneralPurpose family for broad availability and
1060///    reasonable cost. Scale horizontally for more capacity.
1061///
1062/// Returns an error if no suitable instance type is found.
1063pub fn select_instance_type(
1064    platform: Platform,
1065    requirements: &WorkloadRequirements,
1066) -> Result<InstanceSelection, String> {
1067    // Determine which family to use
1068    let family = select_family(requirements);
1069
1070    // Filter catalog to matching platform + family
1071    let candidates: Vec<&InstanceTypeSpec> = CATALOG
1072        .iter()
1073        .filter(|spec| spec.platform == platform && spec.family == family)
1074        .collect();
1075
1076    if candidates.is_empty() {
1077        return Err(format!(
1078            "no {family:?} instance types in catalog for platform {platform}"
1079        ));
1080    }
1081
1082    // For GPU workloads, filter by GPU type
1083    let candidates = if let Some(ref gpu) = requirements.gpu {
1084        let filtered: Vec<&InstanceTypeSpec> = candidates
1085            .into_iter()
1086            .filter(|spec| {
1087                spec.gpu.as_ref().map_or(false, |g| {
1088                    g.gpu_type == gpu.gpu_type && g.count >= gpu.count
1089                })
1090            })
1091            .collect();
1092        if filtered.is_empty() {
1093            return Err(format!(
1094                "no instance type for GPU type '{}' x{} on platform {platform}",
1095                gpu.gpu_type, gpu.count
1096            ));
1097        }
1098        filtered
1099    } else {
1100        candidates
1101    };
1102
1103    // For storage workloads, filter by ephemeral storage capacity
1104    let candidates = if family == InstanceFamily::StorageOptimized {
1105        let filtered: Vec<&InstanceTypeSpec> = candidates
1106            .into_iter()
1107            .filter(|spec| spec.ephemeral_storage_bytes >= requirements.max_ephemeral_storage_bytes)
1108            .collect();
1109        if filtered.is_empty() {
1110            return Err(format!(
1111                "no storage-optimized instance with >= {} bytes ephemeral storage on platform {platform}",
1112                requirements.max_ephemeral_storage_bytes
1113            ));
1114        }
1115        filtered
1116    } else {
1117        candidates
1118    };
1119
1120    // Cap at MAX_STANDARD_VCPU for non-GPU/non-storage workloads
1121    let vcpu_cap =
1122        if family == InstanceFamily::GpuCompute || family == InstanceFamily::StorageOptimized {
1123            u32::MAX
1124        } else {
1125            MAX_STANDARD_VCPU
1126        };
1127
1128    let desired_target_machines = desired_target_machines(requirements);
1129    let target_cpu =
1130        (requirements.max_cpu_per_container * STANDARD_CONTAINERS_PER_MACHINE * OVERHEAD_FACTOR)
1131            .max(
1132                requirements.total_cpu_at_desired * WORKLOAD_HEADROOM_FACTOR
1133                    / desired_target_machines as f64,
1134            )
1135            .max(0.25);
1136    let target_memory = (requirements.max_memory_per_container as f64
1137        * STANDARD_CONTAINERS_PER_MACHINE
1138        * OVERHEAD_FACTOR)
1139        .max(
1140            requirements.total_memory_bytes_at_desired as f64 * WORKLOAD_HEADROOM_FACTOR
1141                / desired_target_machines as f64,
1142        )
1143        .max(256.0 * MI as f64);
1144
1145    // Find the smallest instance that meets per-container targets within the cap.
1146    let selected = candidates
1147        .iter()
1148        .filter(|spec| {
1149            spec.vcpu <= vcpu_cap
1150                && spec.vcpu as f64 >= target_cpu
1151                && spec.memory_bytes as f64 >= target_memory
1152        })
1153        .min_by_key(|spec| spec.vcpu)
1154        .or_else(|| {
1155            // If nothing fits within the cap, pick the largest instance under the cap
1156            candidates
1157                .iter()
1158                .filter(|spec| spec.vcpu <= vcpu_cap)
1159                .max_by_key(|spec| spec.vcpu)
1160        })
1161        .or_else(|| {
1162            // Last resort: pick the smallest available instance (for GPU/storage)
1163            candidates.iter().min_by_key(|spec| spec.vcpu)
1164        })
1165        .ok_or_else(|| format!("no instance types available for platform {platform}"))?;
1166
1167    // Calculate machine counts
1168    let max_machines = compute_max_machines(requirements, selected);
1169    let min_machines = compute_min_machines(requirements, selected, max_machines);
1170
1171    Ok(InstanceSelection {
1172        instance_type: selected.name,
1173        profile: selected.to_machine_profile(),
1174        min_machines,
1175        max_machines,
1176    })
1177}
1178
1179/// Select instance family based on workload characteristics.
1180///
1181/// Uses GeneralPurpose for all standard workloads — widely available across
1182/// regions and cost-effective. Only specialized workloads (GPU, large ephemeral
1183/// storage) get specialized families. Very small workloads get burstable.
1184pub fn select_family(requirements: &WorkloadRequirements) -> InstanceFamily {
1185    // GPU workloads always get GPU instances
1186    if requirements.gpu.is_some() {
1187        return InstanceFamily::GpuCompute;
1188    }
1189
1190    // Large ephemeral storage needs NVMe (storage-optimized)
1191    if requirements.max_ephemeral_storage_bytes > STORAGE_OPTIMIZED_THRESHOLD {
1192        return InstanceFamily::StorageOptimized;
1193    }
1194
1195    // Very small workloads use burstable instances
1196    if requirements.total_cpu_at_max < 2.0 {
1197        return InstanceFamily::Burstable;
1198    }
1199
1200    // All other workloads use GeneralPurpose — available everywhere, good pricing
1201    InstanceFamily::GeneralPurpose
1202}
1203
1204/// Calculate maximum machines needed to fit the workload with headroom.
1205fn compute_max_machines(requirements: &WorkloadRequirements, instance: &InstanceTypeSpec) -> u32 {
1206    let cpu_with_headroom = requirements.total_cpu_at_max * WORKLOAD_HEADROOM_FACTOR;
1207    let cpu_machines = (cpu_with_headroom / allocatable_cpu(instance)).ceil() as u32;
1208
1209    let mem_with_headroom =
1210        requirements.total_memory_bytes_at_max as f64 * WORKLOAD_HEADROOM_FACTOR;
1211    let mem_machines =
1212        (mem_with_headroom / allocatable_memory_bytes(instance) as f64).ceil() as u32;
1213
1214    // Take the larger of CPU-based and memory-based, clamped to cluster limit
1215    cpu_machines
1216        .max(mem_machines)
1217        .max(1)
1218        .min(MAX_MACHINES_PER_CLUSTER)
1219}
1220
1221/// Calculate minimum machines for HA.
1222fn compute_min_machines(
1223    requirements: &WorkloadRequirements,
1224    instance: &InstanceTypeSpec,
1225    max_machines: u32,
1226) -> u32 {
1227    let cpu_with_headroom = requirements.total_cpu_at_desired * WORKLOAD_HEADROOM_FACTOR;
1228    let cpu_machines = (cpu_with_headroom / allocatable_cpu(instance)).ceil() as u32;
1229
1230    let mem_with_headroom =
1231        requirements.total_memory_bytes_at_desired as f64 * WORKLOAD_HEADROOM_FACTOR;
1232    let mem_machines =
1233        (mem_with_headroom / allocatable_memory_bytes(instance) as f64).ceil() as u32;
1234
1235    cpu_machines
1236        .max(mem_machines)
1237        .max(1)
1238        .min(2)
1239        .min(max_machines)
1240}
1241
1242fn desired_target_machines(requirements: &WorkloadRequirements) -> u32 {
1243    if requirements.total_cpu_at_desired >= 2.0
1244        || requirements.total_memory_bytes_at_desired >= 4 * GI
1245    {
1246        2
1247    } else {
1248        1
1249    }
1250}
1251
1252fn allocatable_cpu(instance: &InstanceTypeSpec) -> f64 {
1253    (instance.vcpu as f64 - SYSTEM_RESERVE_CPU).max(0.25)
1254}
1255
1256fn allocatable_memory_bytes(instance: &InstanceTypeSpec) -> u64 {
1257    instance
1258        .memory_bytes
1259        .saturating_sub(system_reserve_memory_bytes(instance.memory_bytes))
1260        .max(256 * MI)
1261}
1262
1263fn system_reserve_memory_bytes(memory_bytes: u64) -> u64 {
1264    if memory_bytes < 4 * GI {
1265        256 * MI
1266    } else if memory_bytes < 16 * GI {
1267        512 * MI
1268    } else {
1269        GI
1270    }
1271}
1272
1273// ---------------------------------------------------------------------------
1274// Tests
1275// ---------------------------------------------------------------------------
1276
#[cfg(test)]
mod tests {
    use super::*;

    // -- Fixtures --

    /// Requirements for a non-GPU workload whose desired and maximum scale
    /// coincide, with a 10 GiB ephemeral storage request. Tests override
    /// individual fields with struct-update syntax where they differ.
    fn workload(
        total_cpu: f64,
        total_memory_bytes: u64,
        max_cpu_per_container: f64,
        max_memory_per_container: u64,
    ) -> WorkloadRequirements {
        WorkloadRequirements {
            total_cpu_at_desired: total_cpu,
            total_memory_bytes_at_desired: total_memory_bytes,
            total_cpu_at_max: total_cpu,
            total_memory_bytes_at_max: total_memory_bytes,
            max_cpu_per_container,
            max_memory_per_container,
            max_ephemeral_storage_bytes: 10 * GI,
            gpu: None,
        }
    }

    /// An 8 CPU / 32 GiB workload (4 CPU / 16 GiB per container) that needs
    /// one GPU of the given type.
    fn single_gpu_workload(gpu_type: &str) -> WorkloadRequirements {
        WorkloadRequirements {
            gpu: Some(GpuSpec {
                gpu_type: gpu_type.to_string(),
                count: 1,
            }),
            ..workload(8.0, 32 * GI, 4.0, 16 * GI)
        }
    }

    /// Run the selection and resolve the chosen name back to its catalog entry.
    fn select_spec(platform: Platform, req: &WorkloadRequirements) -> &'static InstanceTypeSpec {
        let sel = select_instance_type(platform, req).unwrap();
        find_instance_type(platform, sel.instance_type).unwrap()
    }

    // -- Parsing tests --

    #[test]
    fn test_parse_cpu_plain() {
        assert_eq!(parse_cpu("1").unwrap(), 1.0);
        assert_eq!(parse_cpu("0.5").unwrap(), 0.5);
        assert_eq!(parse_cpu("2.0").unwrap(), 2.0);
        assert_eq!(parse_cpu("16").unwrap(), 16.0);
    }

    #[test]
    fn test_parse_cpu_millicore() {
        assert_eq!(parse_cpu("500m").unwrap(), 0.5);
        assert_eq!(parse_cpu("250m").unwrap(), 0.25);
        assert_eq!(parse_cpu("1000m").unwrap(), 1.0);
        assert_eq!(parse_cpu("100m").unwrap(), 0.1);
    }

    #[test]
    fn test_parse_cpu_invalid() {
        assert!(parse_cpu("").is_err());
        assert!(parse_cpu("abc").is_err());
        assert!(parse_cpu("m").is_err());
    }

    #[test]
    fn test_parse_memory_binary_suffixes() {
        assert_eq!(parse_memory_bytes("1Ki").unwrap(), 1024);
        assert_eq!(parse_memory_bytes("1Mi").unwrap(), 1024 * 1024);
        assert_eq!(parse_memory_bytes("1Gi").unwrap(), 1024 * 1024 * 1024);
        assert_eq!(parse_memory_bytes("4Gi").unwrap(), 4 * 1024 * 1024 * 1024);
        assert_eq!(parse_memory_bytes("512Mi").unwrap(), 512 * 1024 * 1024);
        assert_eq!(
            parse_memory_bytes("1Ti").unwrap(),
            1024u64 * 1024 * 1024 * 1024
        );
    }

    #[test]
    fn test_parse_memory_decimal_suffixes() {
        assert_eq!(parse_memory_bytes("1k").unwrap(), 1000);
        assert_eq!(parse_memory_bytes("1M").unwrap(), 1_000_000);
        assert_eq!(parse_memory_bytes("1G").unwrap(), 1_000_000_000);
        assert_eq!(parse_memory_bytes("1T").unwrap(), 1_000_000_000_000);
    }

    #[test]
    fn test_parse_memory_plain_bytes() {
        assert_eq!(parse_memory_bytes("1024").unwrap(), 1024);
        assert_eq!(parse_memory_bytes("0").unwrap(), 0);
    }

    #[test]
    fn test_parse_memory_invalid() {
        assert!(parse_memory_bytes("").is_err());
        assert!(parse_memory_bytes("abc").is_err());
        assert!(parse_memory_bytes("Gi").is_err());
    }

    #[test]
    fn test_parse_memory_fractional() {
        assert_eq!(parse_memory_bytes("0.5Gi").unwrap(), GI / 2);
        assert_eq!(parse_memory_bytes("1.5Gi").unwrap(), GI + GI / 2);
    }

    // -- Catalog lookup tests --

    #[test]
    fn test_catalog_has_entries_for_all_cloud_platforms() {
        assert!(!catalog_for_platform(Platform::Aws).is_empty());
        assert!(!catalog_for_platform(Platform::Gcp).is_empty());
        assert!(!catalog_for_platform(Platform::Azure).is_empty());
    }

    #[test]
    fn test_catalog_no_entries_for_non_cloud_platforms() {
        assert!(catalog_for_platform(Platform::Local).is_empty());
        assert!(catalog_for_platform(Platform::Kubernetes).is_empty());
    }

    #[test]
    fn test_find_known_instance_type() {
        let spec =
            find_instance_type(Platform::Aws, "m7g.2xlarge").expect("should find m7g.2xlarge");
        assert_eq!(spec.vcpu, 8);
        assert_eq!(spec.memory_bytes, 32 * GI);
        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
    }

    #[test]
    fn test_find_unknown_instance_type() {
        assert!(find_instance_type(Platform::Aws, "nonexistent.xlarge").is_none());
    }

    #[test]
    fn test_find_wrong_platform() {
        assert!(find_instance_type(Platform::Gcp, "m7g.2xlarge").is_none());
    }

    #[test]
    fn test_to_machine_profile() {
        let spec = find_instance_type(Platform::Aws, "m7g.2xlarge").unwrap();
        let profile = spec.to_machine_profile();
        assert_eq!(profile.cpu, "8.0");
        assert_eq!(profile.memory_bytes, 32 * GI);
        assert_eq!(profile.ephemeral_storage_bytes, 20 * GI);
        assert!(profile.gpu.is_none());
    }

    #[test]
    fn test_to_machine_profile_with_gpu() {
        let spec = find_instance_type(Platform::Aws, "p4d.24xlarge").unwrap();
        let profile = spec.to_machine_profile();
        let gpu = profile.gpu.as_ref().expect("should have GPU");
        assert_eq!(gpu.gpu_type, "nvidia-a100");
        assert_eq!(gpu.count, 8);
    }

    // -- Selection algorithm tests --

    #[test]
    fn test_select_burstable_for_small_workload() {
        let req = workload(1.0, 2 * GI, 0.5, GI);
        let spec = select_spec(Platform::Aws, &req);
        assert_eq!(spec.family, InstanceFamily::Burstable);
    }

    #[test]
    fn test_select_general_purpose_for_standard_workload() {
        // Standard workloads always get GeneralPurpose regardless of CPU:memory ratio
        let req = workload(20.0, 80 * GI, 2.0, 8 * GI);
        let spec = select_spec(Platform::Aws, &req);
        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
    }

    #[test]
    fn test_select_general_purpose_even_for_cpu_heavy() {
        // CPU-heavy workloads still get GeneralPurpose (no more ComputeOptimized auto-select)
        let req = workload(20.0, 20 * GI, 2.0, 2 * GI);
        let spec = select_spec(Platform::Aws, &req);
        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
    }

    #[test]
    fn test_select_storage_optimized_for_large_ephemeral() {
        let req = WorkloadRequirements {
            max_ephemeral_storage_bytes: 500 * GI,
            ..workload(8.0, 32 * GI, 2.0, 8 * GI)
        };
        let spec = select_spec(Platform::Aws, &req);
        assert_eq!(spec.family, InstanceFamily::StorageOptimized);
    }

    #[test]
    fn test_select_gpu_instance() {
        let req = single_gpu_workload("nvidia-a100");
        let spec = select_spec(Platform::Aws, &req);
        assert_eq!(spec.family, InstanceFamily::GpuCompute);
        assert!(spec.gpu.is_some());
    }

    #[test]
    fn test_select_works_for_all_cloud_platforms() {
        let req = workload(4.0, 16 * GI, 1.0, 4 * GI);
        for platform in [Platform::Aws, Platform::Gcp, Platform::Azure] {
            let sel = select_instance_type(platform, &req);
            assert!(sel.is_ok(), "selection failed for {platform}");
        }
    }

    #[test]
    fn test_machine_count_reasonable() {
        // Single container: 1 CPU, 2Gi, maxReplicas=20
        let req = workload(20.0, 40 * GI, 1.0, 2 * GI);
        let sel = select_instance_type(Platform::Aws, &req).unwrap();
        assert!(sel.min_machines >= 1);
        assert!(sel.max_machines <= MAX_MACHINES_PER_CLUSTER);
        assert!(sel.max_machines >= sel.min_machines);
    }

    #[test]
    fn test_instance_size_capped_at_8_vcpu() {
        // Even with a very large total workload, instance size is capped at 8 vCPUs
        let req = workload(70.0, 140 * GI, 2.0, 4 * GI);
        let sel = select_instance_type(Platform::Gcp, &req).unwrap();
        let spec = find_instance_type(Platform::Gcp, sel.instance_type).unwrap();
        assert!(
            spec.vcpu <= MAX_STANDARD_VCPU,
            "selected {} with {} vCPUs, expected <= {}",
            spec.name,
            spec.vcpu,
            MAX_STANDARD_VCPU
        );
        assert_eq!(spec.family, InstanceFamily::GeneralPurpose);
        // Should scale horizontally instead
        assert!(sel.max_machines > 1);
    }

    #[test]
    fn test_larger_autoscaled_workload_gets_reasonable_instance() {
        // Simulates a larger autoscaled workload: 4 containers, each 2 CPU / 4 GiB,
        // maxReplicas: 10, 10, 10, 5.
        // CPU:    2*10 + 2*10 + 2*10 + 2*5 = 70
        // Memory: 4*10 + 4*10 + 4*10 + 4*5 = 140 GiB
        let req = WorkloadRequirements {
            max_ephemeral_storage_bytes: 20 * GI,
            ..workload(70.0, 140 * GI, 2.0, 4 * GI)
        };
        let sel = select_instance_type(Platform::Gcp, &req).unwrap();
        // Should pick n2-standard-8 (8 vCPU, 32 GiB) — NOT c3-standard-44
        assert_eq!(sel.instance_type, "n2-standard-8");
        assert!(sel.max_machines >= 2);
    }

    #[test]
    fn test_profile_has_required_fields() {
        let req = workload(4.0, 16 * GI, 1.0, 4 * GI);
        let sel = select_instance_type(Platform::Aws, &req).unwrap();
        assert!(!sel.profile.cpu.is_empty());
        assert!(sel.profile.memory_bytes > 0);
        assert!(sel.profile.ephemeral_storage_bytes > 0);
    }

    #[test]
    fn test_error_for_unsupported_gpu_type() {
        let req = single_gpu_workload("amd-mi300");
        let result = select_instance_type(Platform::Aws, &req);
        assert!(result.is_err());
    }

    #[test]
    fn test_catalog_instance_types_sorted_by_vcpu_within_family() {
        // Catalog hygiene: within each (platform, family) group, vcpu must be
        // non-decreasing. Selection scans with min_by_key/max_by_key, so it does
        // not need sorted input to find the right size, but catalog order does
        // decide which entry wins among equal-vCPU candidates.
        for platform in [Platform::Aws, Platform::Gcp, Platform::Azure] {
            let entries = catalog_for_platform(platform);
            // Keyed by the family's Debug name.
            let mut by_family: std::collections::HashMap<_, Vec<_>> =
                std::collections::HashMap::new();
            for entry in entries {
                by_family
                    .entry(format!("{:?}", entry.family))
                    .or_default()
                    .push(entry);
            }
            for (family, instances) in &by_family {
                for window in instances.windows(2) {
                    assert!(
                        window[0].vcpu <= window[1].vcpu,
                        "catalog not sorted by vcpu for {platform}/{family}: {} ({}) > {} ({})",
                        window[0].name,
                        window[0].vcpu,
                        window[1].name,
                        window[1].vcpu
                    );
                }
            }
        }
    }
}
alien_core/instance_catalog.rs

alien_core/
instance_catalog.rs