inference_lab/config/hardware.rs
1use serde::Deserialize;
2
/// Serde default callback for `HardwareConfig::gpu_memory_utilization`.
/// Matches vLLM's default GPU memory utilization fraction of 0.9.
fn default_gpu_memory_utilization() -> f64 {
    const VLLM_DEFAULT_UTILIZATION: f64 = 0.9;
    VLLM_DEFAULT_UTILIZATION
}
6
/// Static description of a single accelerator, deserialized from a config file.
///
/// All byte/FLOP quantities are raw (not Ki/Mi) units. Fields without a
/// `#[serde(default)]` attribute are required in the config source.
#[derive(Debug, Clone, Deserialize)]
pub struct HardwareConfig {
    /// Accelerator name (e.g., "H100", "A100")
    pub name: String,

    /// Peak compute capacity in FLOP/s, for one specific precision
    /// (e.g., the bf16 figure — must match the precision being modeled).
    pub compute_flops: f64,

    /// Memory bandwidth in bytes/sec
    pub memory_bandwidth: f64,

    /// Total memory capacity in bytes
    pub memory_capacity: u64,

    /// KV cache capacity in bytes (subset of `memory_capacity`).
    /// Defaults to 0, which acts as an "unset" sentinel: a value of 0 is
    /// replaced by a derived value in `compute_kv_cache_capacity`, based on
    /// `gpu_memory_utilization`. NOTE(review): an explicitly configured 0
    /// is indistinguishable from "unset" and will be overwritten.
    #[serde(default)]
    pub kv_cache_capacity: u64,

    /// Fraction of total GPU memory the simulator may use (vLLM default: 0.9).
    /// Only consulted when `kv_cache_capacity` is left at its 0 default.
    #[serde(default = "default_gpu_memory_utilization")]
    pub gpu_memory_utilization: f64,

    /// Number of bytes per model parameter (1 for fp8, 2 for bf16).
    pub bytes_per_param: u32,
}
34
35impl HardwareConfig {
36 /// Calculate KV cache capacity if not explicitly set
37 /// Formula: (memory_capacity * gpu_memory_utilization) - model_size
38 /// This matches vLLM's behavior: requested_memory - non_kv_cache_memory
39 pub fn compute_kv_cache_capacity(&mut self, model_size_bytes: u64) {
40 if self.kv_cache_capacity == 0 {
41 let requested_memory =
42 (self.memory_capacity as f64 * self.gpu_memory_utilization) as u64;
43 // In vLLM, non_kv_cache_memory includes weights + activations + overhead
44 // For simplicity, we approximate this as just the model weights
45 self.kv_cache_capacity = requested_memory.saturating_sub(model_size_bytes);
46 }
47 }
48}