use serde::{Deserialize, Serialize};
use super::gpu::{GpuMonitor, VramStatus, GpuVendor};
use std::collections::HashMap;
/// A device that model data can be assigned to.
///
/// The `Display` implementation renders the variants as `CPU`, `GPU:<index>`,
/// `NPU`, `RAM` and `Storage`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DeviceType {
    /// The host processor.
    CPU,
    /// A GPU, identified by its index.
    GPU(u32),
    /// An NPU accelerator.
    NPU,
    /// System memory.
    RAM,
    /// Persistent storage.
    Storage,
}
impl std::fmt::Display for DeviceType {
    /// Writes the short device label: a fixed name for the unit variants and
    /// `GPU:<index>` for GPUs.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Only the GPU variant carries data, so it is the only one that needs
        // real formatting; every other variant maps to a static label.
        let label = match *self {
            DeviceType::GPU(index) => return write!(f, "GPU:{}", index),
            DeviceType::CPU => "CPU",
            DeviceType::NPU => "NPU",
            DeviceType::RAM => "RAM",
            DeviceType::Storage => "Storage",
        };
        f.write_str(label)
    }
}
/// A level of the memory hierarchy.
///
/// The derived ordering follows declaration order, so
/// `Cache < VRAM < RAM < Storage`. Do not reorder the variants without checking
/// code that compares tiers.
// NOTE(review): the order looks like "closest to compute first" — confirm intent.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum MemoryTier {
    /// Cache memory.
    Cache,
    /// GPU memory.
    VRAM,
    /// System memory.
    RAM,
    /// Persistent storage.
    Storage,
}
/// Policy used by `ResourceBridge::plan_placement` to decide how many of the
/// layers that fit in VRAM are actually placed on the GPU.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum PlacementStrategy {
    /// Put every layer that fits in VRAM on the GPU.
    GPUFirst,
    /// Put half of the layers that would fit in VRAM on the GPU.
    Balanced,
    /// Currently planned exactly like `GPUFirst`.
    LatencyOptimized,
    /// Put a third of the layers that would fit in VRAM on the GPU.
    PowerEfficient,
    /// Currently planned exactly like `GPUFirst`.
    ThroughputOptimized,
}
/// A reservation of memory (and optionally compute) on a single device.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResourceAllocation {
    /// Device the allocation lives on.
    pub device: DeviceType,
    /// Size of the allocation in bytes.
    pub memory_bytes: u64,
    /// Share of the device's compute assigned to this allocation.
    // NOTE(review): presumably 0-100; the planner in this file always writes 0 — confirm.
    pub compute_percent: u32,
    /// Relative priority of the allocation.
    // NOTE(review): ordering (higher vs. lower wins) is not visible here — confirm.
    pub priority: u32,
}
/// The result of planning where a model's layers and KV cache should live.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PlacementPlan {
    /// Identifier of the model the plan was built for.
    pub model_id: String,
    /// One entry per model layer, in layer order.
    pub layers: Vec<LayerAllocation>,
    /// Where the KV cache is placed and how large it is.
    pub kv_cache: ResourceAllocation,
    /// VRAM the plan expects to use, in whole MiB (rounded down).
    pub estimated_vram_mb: u64,
    /// System RAM the plan expects to use, in whole MiB (rounded down).
    pub estimated_ram_mb: u64,
    /// Rough latency estimate in milliseconds per token.
    pub estimated_latency_ms: u32,
}
/// Placement of a single model layer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerAllocation {
    /// Zero-based position of the layer within the model.
    pub layer_index: u32,
    /// Name of the layer; the planner in this file generates `layer_<index>`.
    pub layer_name: String,
    /// Device the layer is assigned to.
    pub device: DeviceType,
    /// Memory the layer occupies, in bytes.
    pub memory_bytes: u64,
}
/// A point-in-time snapshot of the machine's memory, GPU and CPU state.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemResources {
    /// Total physical RAM; `0` when the platform query is unavailable or fails.
    pub total_ram: u64,
    /// Currently available physical RAM; `0` when the platform query is
    /// unavailable or fails.
    pub available_ram: u64,
    /// VRAM status entries reported by the GPU monitor; empty when no monitor
    /// is available.
    // NOTE(review): presumably one entry per GPU, indexed like `DeviceType::GPU` — confirm.
    pub gpu_vram: Vec<VramStatus>,
    /// CPU utilization; the collectors in this file currently always report `0.0`.
    pub cpu_utilization: f32,
    /// CPU temperature, when known; currently never populated.
    pub cpu_temp: Option<f32>,
}
/// Tracks system resources and plans where model data should be placed.
pub struct ResourceBridge {
    /// GPU monitor, or `None` when it could not be created.
    gpu_monitor: Option<GpuMonitor>,
    /// Policy deciding how aggressively layers are placed on the GPU.
    strategy: PlacementStrategy,
    /// Percentage of total VRAM that is never handed out.
    vram_reserve_percent: u32,
    /// Bytes of system RAM that are never handed out.
    ram_reserve_bytes: u64,
    /// Registered placement plans, keyed by model id.
    allocations: HashMap<String, PlacementPlan>,
}
impl ResourceBridge {
/// Creates a bridge that plans with the given strategy.
///
/// GPU monitoring is best effort: if the monitor cannot be created the bridge
/// still works and simply reports no VRAM. The defaults are a 5% VRAM reserve,
/// a 4 GiB RAM reserve and no registered allocations.
pub fn new(strategy: PlacementStrategy) -> Self {
    const DEFAULT_VRAM_RESERVE_PERCENT: u32 = 5;
    const DEFAULT_RAM_RESERVE_BYTES: u64 = 4 * 1024 * 1024 * 1024;

    // A failed monitor is not an error for the bridge; it is stored as `None`.
    let gpu_monitor = GpuMonitor::new().ok();

    Self {
        gpu_monitor,
        strategy,
        vram_reserve_percent: DEFAULT_VRAM_RESERVE_PERCENT,
        ram_reserve_bytes: DEFAULT_RAM_RESERVE_BYTES,
        allocations: HashMap::new(),
    }
}
/// Sets the percentage of total VRAM to keep free.
///
/// Values above 50 are clamped to 50.
pub fn set_vram_reserve(&mut self, percent: u32) {
    const MAX_RESERVE_PERCENT: u32 = 50;

    self.vram_reserve_percent = if percent > MAX_RESERVE_PERCENT {
        MAX_RESERVE_PERCENT
    } else {
        percent
    };
}
/// Sets the amount of system RAM to keep free, in GiB.
///
/// The value is converted to bytes with a saturating multiplication, so an
/// absurdly large `gb` yields a reserve of `u64::MAX` bytes instead of
/// panicking (debug builds) or wrapping around to a small reserve (release
/// builds).
pub fn set_ram_reserve_gb(&mut self, gb: u64) {
    const BYTES_PER_GIB: u64 = 1024 * 1024 * 1024;

    self.ram_reserve_bytes = gb.saturating_mul(BYTES_PER_GIB);
}
/// Takes a fresh snapshot of GPU, RAM and CPU state.
///
/// Without a GPU monitor the VRAM list is empty. `cpu_temp` is never populated.
pub fn get_resources(&self) -> SystemResources {
    let gpu_vram = match &self.gpu_monitor {
        Some(monitor) => monitor.status(),
        None => Default::default(),
    };
    let (total_ram, available_ram) = self.get_system_memory();
    let cpu_utilization = self.get_cpu_utilization();

    SystemResources {
        total_ram,
        available_ram,
        gpu_vram,
        cpu_utilization,
        cpu_temp: None,
    }
}
/// Returns `(total, available)` physical memory as reported by
/// `GlobalMemoryStatusEx`, or `(0, 0)` when the call fails.
#[cfg(windows)]
fn get_system_memory(&self) -> (u64, u64) {
    use windows::Win32::System::SystemInformation::{GlobalMemoryStatusEx, MEMORYSTATUSEX};

    // The API requires `dwLength` to hold the structure size before the call.
    let mut mem_info = MEMORYSTATUSEX {
        dwLength: std::mem::size_of::<MEMORYSTATUSEX>() as u32,
        ..Default::default()
    };

    // SAFETY: `mem_info` is a live, correctly sized `MEMORYSTATUSEX` with
    // `dwLength` initialised, and the pointer is only used during the call.
    let queried = unsafe { GlobalMemoryStatusEx(&mut mem_info) }.is_ok();

    if queried {
        (mem_info.ullTotalPhys, mem_info.ullAvailPhys)
    } else {
        (0, 0)
    }
}
/// Returns `(total, available)` physical memory in bytes, or `(0, 0)` when it
/// cannot be determined.
///
/// The figures come from `/proc/meminfo`. On platforms without that file the
/// read fails and the function reports `(0, 0)`, which callers treat as
/// "unknown".
#[cfg(not(windows))]
fn get_system_memory(&self) -> (u64, u64) {
    let meminfo = match std::fs::read_to_string("/proc/meminfo") {
        Ok(text) => text,
        Err(_) => return (0, 0),
    };

    // Lines look like `MemTotal:       16318480 kB`; the kernel reports KiB.
    let field_bytes = |key: &str| -> Option<u64> {
        meminfo.lines().find_map(|line| {
            // Requiring the ':' right after the key avoids matching longer names.
            let rest = line.strip_prefix(key)?.strip_prefix(':')?;
            let kib: u64 = rest.split_whitespace().next()?.parse().ok()?;
            Some(kib.saturating_mul(1024))
        })
    };

    match field_bytes("MemTotal") {
        Some(total) => {
            // `MemAvailable` is missing on very old kernels; `MemFree` is a
            // conservative stand-in.
            let available = field_bytes("MemAvailable")
                .or_else(|| field_bytes("MemFree"))
                .unwrap_or(0);
            (total, available.min(total))
        }
        None => (0, 0),
    }
}
/// Reports CPU utilization.
///
/// Not implemented on Windows yet: this always returns `0.0`.
#[cfg(windows)]
fn get_cpu_utilization(&self) -> f32 {
    // Placeholder value until a real measurement is wired in.
    const UNMEASURED: f32 = 0.0;
    UNMEASURED
}
/// Reports CPU utilization.
///
/// Not implemented on non-Windows targets yet: this always returns `0.0`.
#[cfg(not(windows))]
fn get_cpu_utilization(&self) -> f32 {
    // Placeholder value until a real measurement is wired in.
    const UNMEASURED: f32 = 0.0;
    UNMEASURED
}
/// Returns the VRAM, summed over all reported GPUs, that can still be handed out.
///
/// Computed as `total - used - reserve`, where the reserve is
/// `vram_reserve_percent` of the total. The result saturates at zero.
pub fn available_vram(&self) -> u64 {
    let snapshot = self.get_resources();

    let mut total: u64 = 0;
    let mut used: u64 = 0;
    for gpu in &snapshot.gpu_vram {
        total += gpu.total;
        used += gpu.used;
    }

    let reserved = (total as f64 * self.vram_reserve_percent as f64 / 100.0) as u64;
    let free = total.saturating_sub(used);
    free.saturating_sub(reserved)
}
/// Returns the system RAM that can still be handed out.
///
/// This is the currently available physical memory minus the configured RAM
/// reserve, saturating at zero.
pub fn available_ram(&self) -> u64 {
    // Query memory directly: a full resource snapshot would also poll the GPU
    // monitor and the CPU, none of which is needed for this figure.
    let (_total, available) = self.get_system_memory();
    available.saturating_sub(self.ram_reserve_bytes)
}
pub fn plan_placement(
&self,
model_id: &str,
model_size_bytes: u64,
num_layers: u32,
kv_cache_bytes: u64,
) -> PlacementPlan {
let available_vram = self.available_vram();
let _available_ram = self.available_ram();
let layer_size = model_size_bytes / num_layers as u64;
let mut layers = Vec::new();
let mut vram_used: u64 = 0;
let mut ram_used: u64 = 0;
let vram_for_layers = available_vram.saturating_sub(kv_cache_bytes);
let max_gpu_layers = (vram_for_layers / layer_size) as u32;
let gpu_layers = match self.strategy {
PlacementStrategy::GPUFirst => max_gpu_layers.min(num_layers),
PlacementStrategy::Balanced => (max_gpu_layers / 2).min(num_layers),
PlacementStrategy::LatencyOptimized => max_gpu_layers.min(num_layers),
PlacementStrategy::PowerEfficient => (max_gpu_layers / 3).min(num_layers),
PlacementStrategy::ThroughputOptimized => max_gpu_layers.min(num_layers),
};
for i in 0..num_layers {
let device = if i < gpu_layers {
vram_used += layer_size;
DeviceType::GPU(0)
} else {
ram_used += layer_size;
DeviceType::RAM
};
layers.push(LayerAllocation {
layer_index: i,
layer_name: format!("layer_{}", i),
device,
memory_bytes: layer_size,
});
}
let kv_device = if vram_used + kv_cache_bytes <= available_vram {
vram_used += kv_cache_bytes;
DeviceType::GPU(0)
} else {
ram_used += kv_cache_bytes;
DeviceType::RAM
};
let gpu_ratio = gpu_layers as f64 / num_layers as f64;
let base_latency = 50; let estimated_latency = ((1.0 - gpu_ratio * 0.8) * base_latency as f64) as u32;
PlacementPlan {
model_id: model_id.to_string(),
layers,
kv_cache: ResourceAllocation {
device: kv_device,
memory_bytes: kv_cache_bytes,
compute_percent: 0,
priority: 1,
},
estimated_vram_mb: vram_used / (1024 * 1024),
estimated_ram_mb: ram_used / (1024 * 1024),
estimated_latency_ms: estimated_latency,
}
}
/// Reports whether memory pressure is high enough to warrant rebalancing.
///
/// Returns `true` when any GPU is above 95% VRAM usage or system RAM usage is
/// above 90%.
pub fn needs_rebalance(&self) -> bool {
    let snapshot = self.get_resources();

    let gpu_under_pressure = snapshot
        .gpu_vram
        .iter()
        .any(|gpu| gpu.usage_percent() > 95.0);
    if gpu_under_pressure {
        return true;
    }

    // When `total_ram` is 0 the division yields NaN or infinity, the comparison
    // below is false, and unknown RAM therefore never triggers a rebalance.
    let free_fraction = snapshot.available_ram as f64 / snapshot.total_ram as f64;
    let ram_usage = 100.0 - (free_fraction * 100.0);
    ram_usage > 90.0
}
/// Suggests moving data from overloaded GPUs to system RAM.
///
/// One suggestion is produced for every GPU above 90% VRAM usage. Its size is
/// the amount by which the GPU's used VRAM exceeds 80% of its total.
pub fn suggest_offload(&self) -> Vec<OffloadSuggestion> {
    let snapshot = self.get_resources();

    snapshot
        .gpu_vram
        .iter()
        .enumerate()
        .filter(|(_, gpu)| gpu.usage_percent() > 90.0)
        .map(|(index, gpu)| {
            // Usage level the GPU should drop back to after offloading.
            let target_ceiling = (gpu.total as f64 * 0.8) as u64;
            OffloadSuggestion {
                from: DeviceType::GPU(index as u32),
                to: DeviceType::RAM,
                bytes: gpu.used.saturating_sub(target_ceiling),
                reason: format!("GPU {} at {:.1}% VRAM usage", index, gpu.usage_percent()),
            }
        })
        .collect()
}
/// Records a plan as active, replacing any plan stored for the same model id.
pub fn register_allocation(&mut self, plan: PlacementPlan) {
    let key = plan.model_id.clone();
    // Any previous plan for this model is dropped.
    let _replaced = self.allocations.insert(key, plan);
}
/// Forgets the plan registered for `model_id`; does nothing if there is none.
pub fn remove_allocation(&mut self, model_id: &str) {
    let _removed = self.allocations.remove(model_id);
}
/// Returns all currently registered plans, keyed by model id.
pub fn active_allocations(&self) -> &HashMap<String, PlacementPlan> {
    let registered = &self.allocations;
    registered
}
}
/// A recommendation to move data from one device to another.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OffloadSuggestion {
    /// Device the data should be moved away from.
    pub from: DeviceType,
    /// Device the data should be moved to.
    pub to: DeviceType,
    /// Amount of data to move, in bytes.
    pub bytes: u64,
    /// Human-readable explanation of why the move is suggested.
    pub reason: String,
}
impl std::fmt::Display for OffloadSuggestion {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Offload {} MB from {} to {}: {}",
self.bytes / (1024 * 1024),
self.from,
self.to,
self.reason
)
}
}
impl std::fmt::Display for PlacementPlan {
    /// Writes a multi-line summary: memory estimates, latency estimate, how many
    /// layers are on a GPU versus elsewhere, and the KV cache device.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Placement Plan for {}", self.model_id)?;
        writeln!(f, " VRAM: {} MB", self.estimated_vram_mb)?;
        writeln!(f, " RAM: {} MB", self.estimated_ram_mb)?;
        writeln!(f, " Est. Latency: {} ms/token", self.estimated_latency_ms)?;

        // Every layer that is not on a GPU is reported under the "CPU" label.
        let off_gpu = self
            .layers
            .iter()
            .filter(|layer| !matches!(layer.device, DeviceType::GPU(_)))
            .count();
        let on_gpu = self.layers.len() - off_gpu;

        writeln!(f, " Layers: {} GPU, {} CPU", on_gpu, off_gpu)?;
        writeln!(f, " KV Cache: {}", self.kv_cache.device)
    }
}
impl Default for ResourceBridge {
fn default() -> Self {
Self::new(PlacementStrategy::GPUFirst)
}
}