llmux/lib.rs
1//! # llmux
2//!
//! Zero-reload model switching for vLLM - manages multiple models on a shared GPU.
4//!
5//! This crate provides:
6//! - **Orchestrator**: Lazily starts vLLM processes on first request
7//! - **Switcher**: Coordinates wake/sleep between models
8//! - **Middleware**: Axum layer that integrates with onwards proxy
9//!
10//! ## Architecture
11//!
12//! ```text
13//! ┌─────────────────────────────────────────────────────────────┐
14//! │ llmux │
15//! │ ┌─────────────────────────────────────────────────────┐ │
16//! │ │ Orchestrator │ │
17//! │ │ - Spawns vLLM processes lazily │ │
18//! │ │ - Tracks: NotStarted | Starting | Running | Sleeping │ │
19//! │ └─────────────────────────────────────────────────────┘ │
20//! │ │ │
21//! │ ┌─────────────────────────────────────────────────────┐ │
22//! │ │ Middleware Layer │ │
23//! │ │ - Extracts model from request │ │
24//! │ │ - Ensures model ready before forwarding │ │
25//! │ └─────────────────────────────────────────────────────┘ │
26//! │ │ │
27//! │ ┌─────────────────────────────────────────────────────┐ │
28//! │ │ Onwards Proxy │ │
29//! │ │ - Routes to vLLM by model name │ │
30//! │ └─────────────────────────────────────────────────────┘ │
31//! │ │ │
32//! │ ┌───────────────────┼───────────────────┐ │
33//! │ ▼ ▼ ▼ │
34//! │ [vLLM:8001] [vLLM:8002] [vLLM:8003] │
35//! │ (llama) (mistral) (qwen) │
36//! └─────────────────────────────────────────────────────────────┘
37//! ```
38
39mod config;
40pub mod control;
41mod middleware;
42pub mod object_store;
43mod orchestrator;
44mod policy;
45mod switcher;
46pub(crate) mod types;
47pub mod validate;
48
49pub use config::{CheckpointConfig, Config, ModelConfig, ObjectStoreConfig, PolicyConfig};
50pub use middleware::{ModelSwitcherLayer, ModelSwitcherService};
51pub use orchestrator::{Orchestrator, OrchestratorError, ProcessState};
52pub use policy::{
53 CostAwarePolicy, FifoPolicy, PolicyContext, PolicyDecision, ScheduleContext, SwitchPolicy,
54 TimeSlicePolicy,
55};
56pub use switcher::{InFlightGuard, ModelSwitcher};
57pub use types::{
58 EvictionPolicy, ProcessStrategy, SwitchError, SwitchMode, SwitcherState, WeightStrategy,
59};
60
61use anyhow::Result;
62use std::sync::Arc;
63use tracing::info;
64
65/// Build the complete llmux stack
66///
67/// Returns:
68/// - The main Axum router (proxy + middleware)
69/// - An optional metrics router (when `config.metrics_port > 0`)
70/// - The control API router (for the admin port)
71/// - The model switcher (for driving warmup before serving)
72pub async fn build_app(
73 config: Config,
74) -> Result<(
75 axum::Router,
76 Option<axum::Router>,
77 axum::Router,
78 ModelSwitcher,
79)> {
80 info!("Building llmux with {} models", config.models.len());
81
82 // Create orchestrator with configured command
83 let orchestrator = Arc::new(Orchestrator::with_startup_timeout(
84 config.models.clone(),
85 config.vllm_command.clone(),
86 config.checkpoint.clone(),
87 std::time::Duration::from_secs(config.startup_timeout_secs),
88 ));
89
90 // Create policy
91 let model_names: Vec<String> = config.models.keys().cloned().collect();
92 let policy = config.policy.build_policy(&model_names);
93
94 // Create switcher
95 let switcher = ModelSwitcher::new(orchestrator.clone(), policy);
96
97 // Spawn background scheduler if the policy uses one
98 let _scheduler_handle = switcher.clone().spawn_scheduler();
99
100 // Build control API router (served on separate admin port)
101 let control = control::control_router(switcher.clone());
102
103 // Build onwards targets from model configs
104 let targets = config.build_onwards_targets()?;
105
106 // Create onwards app state
107 let onwards_state = onwards::AppState::new(targets);
108 let onwards_router = onwards::build_router(onwards_state);
109
110 // Wrap with model switcher middleware
111 let mut app = onwards_router.layer(ModelSwitcherLayer::new(switcher.clone()));
112
113 // Install metrics layer and build metrics router if enabled
114 let metrics_router = if config.metrics_port > 0 {
115 let (prometheus_layer, handle) = onwards::build_metrics_layer_and_handle("llmux");
116 app = app.layer(prometheus_layer);
117 Some(onwards::build_metrics_router(handle))
118 } else {
119 None
120 };
121
122 Ok((app, metrics_router, control, switcher))
123}
124
125/// Run the warmup phase: start each model, run one inference, then sleep it.
126///
127/// Iterates all models sequentially. Each model is cold-started, warmed with
128/// a single inference request (to compile CUDA graphs and warm the allocator),
129/// then put to sleep using its configured eviction policy. After warmup,
130/// every model is in its warm sleeping state so the first real request
131/// triggers a fast wake rather than a cold start.
132pub async fn run_warmup(switcher: &ModelSwitcher) -> Result<()> {
133 let orchestrator = switcher.orchestrator();
134 let models = orchestrator.registered_models();
135
136 info!(count = models.len(), "Starting warmup phase");
137
138 for model in &models {
139 let eviction = orchestrator
140 .eviction_policy_for(model)
141 .expect("model registered but no eviction policy");
142
143 let port = switcher
144 .orchestrator()
145 .model_port(model)
146 .expect("model registered but no port");
147
148 let model_path = switcher
149 .orchestrator()
150 .model_path(model)
151 .expect("model registered but no model_path");
152
153 info!(model = %model, "Warmup: starting");
154 orchestrator
155 .ensure_running(model)
156 .await
157 .map_err(|e| anyhow::anyhow!("warmup: failed to start {}: {}", model, e))?;
158
159 info!(model = %model, "Warmup: running inference");
160 validate::run_warmup_inference(port, &model_path).await?;
161
162 info!(model = %model, ?eviction, "Warmup: sleeping");
163 orchestrator
164 .sleep_model(model, eviction)
165 .await
166 .map_err(|e| anyhow::anyhow!("warmup: failed to sleep {}: {}", model, e))?;
167
168 info!(model = %model, "Warmup: complete");
169 }
170
171 info!("Warmup phase complete — all models warmed and sleeping");
172 Ok(())
173}