use crate::eviction::select_evictions;
use crate::state::ClusterState;
use crate::types::*;
/// Decides how a request for `model_id` should be served given `cluster` and `config`.
///
/// The decision is made in this order:
///
/// 1. If `cluster.find_loaded_model` reports a running instance and the GPU with
///    that id is present in `cluster.gpus` and is not `Unhealthy`, the request is
///    routed there (`RouteToLoaded`).
/// 2. If the cluster has no spec for the model, the request is rejected.
/// 3. If `gpus_with_space_for` returns candidates, the best one (per
///    `select_best_gpu`) is chosen and the model is loaded without evictions.
/// 4. Otherwise the GPUs from `healthy_gpus_by_available_memory` are visited in
///    the order returned, and the first one for which `select_evictions` yields a
///    non-empty eviction list is chosen. An empty list is treated like "no plan".
/// 5. Otherwise the request is rejected. The reason mentions pinning whenever any
///    model loaded on one of those GPUs is pinned in the registry.
///
/// NOTE(review): the pin-specific reason in step 5 does not check that evicting
/// the pinned models would actually make room; it only checks that one exists.
pub fn route_request(
    model_id: &ModelId,
    cluster: &ClusterState,
    config: &RoutingConfig,
) -> RoutingDecision {
    // Fast path: reuse a running instance unless its host GPU is unhealthy
    // (or cannot be found in `cluster.gpus`).
    if let Some((gpu_id, port)) = cluster.find_loaded_model(model_id) {
        let host = cluster.gpus.iter().find(|candidate| candidate.id == gpu_id);
        if matches!(host, Some(gpu) if gpu.health != GpuHealth::Unhealthy) {
            return RoutingDecision::RouteToLoaded { gpu_id, port };
        }
    }

    // Without a spec we do not know the VRAM requirement, so nothing can be placed.
    let spec = if let Some(found) = cluster.get_model_spec(model_id) {
        found
    } else {
        return RoutingDecision::Reject {
            reason: format!("Model '{}' not found in registry", model_id),
        };
    };

    // Pin lookup handed to the eviction selector; ids missing from the registry
    // count as not pinned.
    let is_pinned =
        |id: &ModelId| matches!(cluster.model_registry.get(id), Some(entry) if entry.pin);

    // Prefer a GPU that already has room, so nothing has to be evicted.
    let direct_fit = cluster.gpus_with_space_for(spec.vram_required, config.vram_headroom);
    if let Some(target) = select_best_gpu(&direct_fit, config) {
        return RoutingDecision::LoadModel {
            gpu_id: target.id,
            evict: Vec::new(),
        };
    }

    // No direct fit: take the first GPU for which a non-empty eviction plan exists.
    let healthy_gpus = cluster.healthy_gpus_by_available_memory();
    for gpu in &healthy_gpus {
        let plan = select_evictions(
            gpu,
            spec.vram_required,
            config.vram_headroom,
            config.eviction_policy,
            is_pinned,
        )
        .filter(|victims| !victims.is_empty());

        if let Some(evict) = plan {
            return RoutingDecision::LoadModel {
                gpu_id: gpu.id,
                evict,
            };
        }
    }

    // Nothing worked; pick the reject reason based on whether any resident model is pinned.
    let any_pinned_resident = healthy_gpus
        .iter()
        .flat_map(|gpu| gpu.loaded_models.iter())
        .any(|resident| is_pinned(&resident.model_id));

    let reason = if any_pinned_resident {
        format!(
            "Cannot load model '{}' (requires {}): all eligible eviction candidates are pinned",
            model_id, spec.vram_required
        )
    } else {
        format!(
            "No GPU can fit model '{}' (requires {}), even after eviction",
            model_id, spec.vram_required
        )
    };
    RoutingDecision::Reject { reason }
}
/// Picks the most attractive GPU from `candidates`, or `None` when the slice is empty.
///
/// Candidates are ranked by `health_priority` first (lower rank wins) and, within
/// the same rank, by larger `memory_available`. When several candidates tie on
/// both criteria, the one that appears earliest in `candidates` is returned.
///
/// `_config` is accepted but currently unused.
fn select_best_gpu<'a>(
    candidates: &[&'a GpuState],
    _config: &RoutingConfig,
) -> Option<&'a GpuState> {
    // `min_by` yields the first of several equal minima, which is exactly what a
    // stable sort followed by `first()` would pick, but without allocating or
    // ordering the whole slice.
    candidates.iter().copied().min_by(|a, b| {
        health_priority(a.health)
            .cmp(&health_priority(b.health))
            // Operands are swapped so that more free memory ranks earlier.
            .then_with(|| b.memory_available.cmp(&a.memory_available))
    })
}
/// Maps a GPU health state to a numeric rank used for ordering candidates.
///
/// `Healthy` maps to 0, `Degraded` to 1 and `Unhealthy` to 2, so sorting by the
/// returned value in ascending order puts the healthiest GPUs first.
fn health_priority(health: GpuHealth) -> u8 {
    // Arms are listed worst-first; only the numeric values matter to callers.
    match health {
        GpuHealth::Unhealthy => 2,
        GpuHealth::Degraded => 1,
        GpuHealth::Healthy => 0,
    }
}
#[cfg(test)]
mod tests {
    // Unit tests for `route_request`. Each test builds a small cluster with
    // `GpuStateBuilder`, a model registry, and checks the resulting decision.
    use super::*;
    use crate::state::test_helpers::*;
    use bytesize::ByteSize;
    use chrono::{Duration, Utc};
    use std::collections::HashMap;

    /// Builds a registry from `(model id, vram_gb)` pairs via `test_model_spec`.
    fn make_registry(specs: Vec<(&str, u64)>) -> HashMap<ModelId, ModelSpec> {
        specs
            .into_iter()
            .map(|(id, vram_gb)| test_model_spec(id, vram_gb))
            .collect()
    }

    /// Like `make_registry`, but also sets each spec's `pin` flag from the third
    /// tuple element.
    fn make_registry_with_pins(specs: Vec<(&str, u64, bool)>) -> HashMap<ModelId, ModelSpec> {
        specs
            .into_iter()
            .map(|(id, vram_gb, pin)| {
                let (model_id, mut spec) = test_model_spec(id, vram_gb);
                spec.pin = pin;
                (model_id, spec)
            })
            .collect()
    }

    /// Default routing config with the VRAM headroom overridden to 1 GB.
    fn default_config() -> RoutingConfig {
        RoutingConfig {
            vram_headroom: ByteSize::gb(1),
            ..Default::default()
        }
    }

    // A model already running on a GPU is routed to that GPU and port.
    #[test]
    fn routes_to_already_loaded_model() {
        let gpus = vec![GpuStateBuilder::new(0)
            .memory_total_gb(24)
            .with_model("model-a", 8, 8001)
            .build()];
        let registry = make_registry(vec![("model-a", 8)]);
        let cluster = ClusterState::new(gpus, registry);
        let decision = route_request(&"model-a".into(), &cluster, &default_config());
        assert!(matches!(
            decision,
            RoutingDecision::RouteToLoaded {
                gpu_id: GpuId(0),
                port: 8001
            }
        ));
    }

    // With empty GPUs available, the model is loaded without evicting anything.
    #[test]
    fn loads_model_on_gpu_with_space() {
        let gpus = vec![
            GpuStateBuilder::new(0).memory_total_gb(24).build(),
            GpuStateBuilder::new(1).memory_total_gb(24).build(),
        ];
        let registry = make_registry(vec![("model-a", 8)]);
        let cluster = ClusterState::new(gpus, registry);
        let decision = route_request(&"model-a".into(), &cluster, &default_config());
        assert!(matches!(
            decision,
            RoutingDecision::LoadModel { evict, .. } if evict.is_empty()
        ));
    }

    // A single GPU hosting two models must evict to fit a third; the model that
    // was used longest ago is expected to be among the evictions.
    #[test]
    fn evicts_when_no_space_available() {
        let old = Utc::now() - Duration::hours(2);
        let recent = Utc::now() - Duration::minutes(5);
        let gpus = vec![GpuStateBuilder::new(0)
            .memory_total_gb(24)
            .with_model_last_used("old-model", 12, 8001, old)
            .with_model_last_used("recent-model", 10, 8002, recent)
            .build()];
        let registry = make_registry(vec![
            ("old-model", 12),
            ("recent-model", 10),
            ("new-model", 10),
        ]);
        let cluster = ClusterState::new(gpus, registry);
        let decision = route_request(&"new-model".into(), &cluster, &default_config());
        match decision {
            RoutingDecision::LoadModel { gpu_id, evict } => {
                assert_eq!(gpu_id, GpuId(0));
                assert!(evict.contains(&ModelId("old-model".into())));
            }
            other => panic!("Expected LoadModel, got {:?}", other),
        }
    }

    // A model absent from the registry is rejected with the registry-miss reason,
    // not with one of the capacity-related reasons.
    #[test]
    fn rejects_unknown_model() {
        let gpus = vec![GpuStateBuilder::new(0).memory_total_gb(24).build()];
        let registry = make_registry(vec![]);
        let cluster = ClusterState::new(gpus, registry);
        let decision = route_request(&"nonexistent".into(), &cluster, &default_config());
        match decision {
            RoutingDecision::Reject { reason } => {
                assert!(
                    reason.contains("not found in registry"),
                    "expected registry-miss reject reason, got: {reason}"
                );
            }
            other => panic!("Expected Reject, got {:?}", other),
        }
    }

    // A model larger than every GPU is rejected with the capacity reason; no
    // models are loaded, so the pin-specific reason must not be used.
    #[test]
    fn rejects_model_too_large_for_any_gpu() {
        let gpus = vec![
            GpuStateBuilder::new(0).memory_total_gb(24).build(),
            GpuStateBuilder::new(1).memory_total_gb(24).build(),
        ];
        let registry = make_registry(vec![("huge-model", 48)]);
        let cluster = ClusterState::new(gpus, registry);
        let decision = route_request(&"huge-model".into(), &cluster, &default_config());
        match decision {
            RoutingDecision::Reject { reason } => {
                assert!(
                    reason.contains("even after eviction"),
                    "expected capacity reject reason, got: {reason}"
                );
                assert!(
                    !reason.contains("pinned"),
                    "no pinned models exist, got: {reason}"
                );
            }
            other => panic!("Expected Reject, got {:?}", other),
        }
    }

    // Given one unhealthy and one healthy empty GPU, the healthy one is chosen.
    #[test]
    fn avoids_unhealthy_gpu() {
        let gpus = vec![
            GpuStateBuilder::new(0)
                .memory_total_gb(24)
                .health(GpuHealth::Unhealthy)
                .build(),
            GpuStateBuilder::new(1)
                .memory_total_gb(24)
                .health(GpuHealth::Healthy)
                .build(),
        ];
        let registry = make_registry(vec![("model-a", 8)]);
        let cluster = ClusterState::new(gpus, registry);
        let decision = route_request(&"model-a".into(), &cluster, &default_config());
        match decision {
            RoutingDecision::LoadModel { gpu_id, .. } => {
                assert_eq!(gpu_id, GpuId(1), "Should route to healthy GPU");
            }
            other => panic!("Expected LoadModel, got {:?}", other),
        }
    }

    // Health outranks free memory: a healthy 24 GB GPU beats a degraded 48 GB one.
    #[test]
    fn prefers_healthy_over_degraded() {
        let gpus = vec![
            GpuStateBuilder::new(0)
                .memory_total_gb(48)
                .health(GpuHealth::Degraded)
                .build(),
            GpuStateBuilder::new(1)
                .memory_total_gb(24)
                .health(GpuHealth::Healthy)
                .build(),
        ];
        let registry = make_registry(vec![("model-a", 8)]);
        let cluster = ClusterState::new(gpus, registry);
        let decision = route_request(&"model-a".into(), &cluster, &default_config());
        match decision {
            RoutingDecision::LoadModel { gpu_id, .. } => {
                assert_eq!(
                    gpu_id,
                    GpuId(1),
                    "Should prefer healthy GPU even with less memory"
                );
            }
            other => panic!("Expected LoadModel, got {:?}", other),
        }
    }

    // The only resident model is pinned, so the reject reason must mention pinning.
    #[test]
    fn rejects_with_pin_reason_when_only_pinned_models_block_load() {
        let gpus = vec![GpuStateBuilder::new(0)
            .memory_total_gb(24)
            .with_model("pinned-big", 20, 8001)
            .build()];
        let registry =
            make_registry_with_pins(vec![("pinned-big", 20, true), ("new-model", 10, false)]);
        let cluster = ClusterState::new(gpus, registry);
        let decision = route_request(&"new-model".into(), &cluster, &default_config());
        match decision {
            RoutingDecision::Reject { reason } => {
                assert!(
                    reason.contains("pinned"),
                    "expected pin-specific reject reason, got: {reason}"
                );
            }
            other => panic!("expected Reject with pinned reason, got {:?}", other),
        }
    }

    // The pinned model was used longest ago, yet only the unpinned one may be evicted.
    #[test]
    fn evicts_non_pinned_model_even_if_pinned_is_lru() {
        let old = Utc::now() - Duration::hours(10);
        let recent = Utc::now() - Duration::minutes(1);
        let gpus = vec![GpuStateBuilder::new(0)
            .memory_total_gb(24)
            .with_model_last_used("pinned-stale", 10, 8001, old)
            .with_model_last_used("recent", 10, 8002, recent)
            .build()];
        let registry = make_registry_with_pins(vec![
            ("pinned-stale", 10, true),
            ("recent", 10, false),
            ("new-model", 10, false),
        ]);
        let cluster = ClusterState::new(gpus, registry);
        let decision = route_request(&"new-model".into(), &cluster, &default_config());
        match decision {
            RoutingDecision::LoadModel { gpu_id, evict } => {
                assert_eq!(gpu_id, GpuId(0));
                assert_eq!(evict, vec![ModelId("recent".into())]);
            }
            other => panic!("expected LoadModel evicting 'recent', got {:?}", other),
        }
    }

    // The model is resident only on an unhealthy GPU. The router must not reuse
    // that instance; with an empty healthy GPU available it should load there
    // instead, without evicting anything.
    #[test]
    fn does_not_route_to_unhealthy_gpu_even_if_model_loaded_there() {
        let gpus = vec![
            GpuStateBuilder::new(0)
                .memory_total_gb(24)
                .health(GpuHealth::Unhealthy)
                .with_model("model-a", 8, 8001)
                .build(),
            GpuStateBuilder::new(1)
                .memory_total_gb(24)
                .health(GpuHealth::Healthy)
                .build(),
        ];
        let registry = make_registry(vec![("model-a", 8)]);
        let cluster = ClusterState::new(gpus, registry);
        let decision = route_request(&"model-a".into(), &cluster, &default_config());
        match decision {
            RoutingDecision::LoadModel { gpu_id, evict } => {
                assert_eq!(gpu_id, GpuId(1), "Should not route to unhealthy GPU");
                assert!(
                    evict.is_empty(),
                    "healthy GPU is empty, nothing should be evicted"
                );
            }
            other => panic!("Expected LoadModel on the healthy GPU, got {:?}", other),
        }
    }
}