//! inference_lab/config/hardware.rs
//!
//! Hardware accelerator configuration: compute/memory specs and KV-cache sizing.
1use serde::Deserialize;
2
/// Serde default for [`HardwareConfig::gpu_memory_utilization`].
/// Matches vLLM's default of 0.9.
fn default_gpu_memory_utilization() -> f64 {
    // Named constant so the magic number is self-describing at the definition site.
    const VLLM_DEFAULT_UTILIZATION: f64 = 0.9;
    VLLM_DEFAULT_UTILIZATION
}
6
/// Static description of a single accelerator used by the simulator.
///
/// Deserialized from config. `kv_cache_capacity` may be left unset (0) and
/// later derived via [`HardwareConfig::compute_kv_cache_capacity`].
#[derive(Debug, Clone, Deserialize)]
pub struct HardwareConfig {
    /// Accelerator name (e.g., "H100", "A100")
    pub name: String,

    /// Peak compute throughput in FLOPS, for one specific precision
    /// (e.g., bf16); f64 to allow values like 9.89e14 directly from config.
    pub compute_flops: f64,

    /// Peak memory bandwidth in bytes/sec
    pub memory_bandwidth: f64,

    /// Total on-device memory capacity in bytes
    pub memory_capacity: u64,

    /// KV cache capacity in bytes (subset of memory_capacity)
    /// If not specified, calculated from gpu_memory_utilization.
    /// NOTE: 0 is the "unset" sentinel — `compute_kv_cache_capacity`
    /// only derives a value when this field is 0.
    #[serde(default)]
    pub kv_cache_capacity: u64,

    /// Fraction of GPU memory to use (vLLM default: 0.9)
    /// Used to calculate kv_cache_capacity if not explicitly set
    #[serde(default = "default_gpu_memory_utilization")]
    pub gpu_memory_utilization: f64,

    /// Number of bytes per parameter (1 for fp8, 2 for bf16)
    pub bytes_per_param: u32,
}
34
35impl HardwareConfig {
36    /// Calculate KV cache capacity if not explicitly set
37    /// Formula: (memory_capacity * gpu_memory_utilization) - model_size
38    /// This matches vLLM's behavior: requested_memory - non_kv_cache_memory
39    pub fn compute_kv_cache_capacity(&mut self, model_size_bytes: u64) {
40        if self.kv_cache_capacity == 0 {
41            let requested_memory =
42                (self.memory_capacity as f64 * self.gpu_memory_utilization) as u64;
43            // In vLLM, non_kv_cache_memory includes weights + activations + overhead
44            // For simplicity, we approximate this as just the model weights
45            self.kv_cache_capacity = requested_memory.saturating_sub(model_size_bytes);
46        }
47    }
48}