// llmux/lib.rs
1//! # llmux
2//!
3//! Zero-reload model switching for vLLM - manages multiple models on shared GPU.
4//!
5//! This crate provides:
6//! - **Orchestrator**: Lazily starts vLLM processes on first request
7//! - **Switcher**: Coordinates wake/sleep between models
8//! - **Middleware**: Axum layer that integrates with onwards proxy
9//!
10//! ## Architecture
11//!
12//! ```text
13//! ┌─────────────────────────────────────────────────────────────┐
14//! │ llmux │
15//! │ ┌─────────────────────────────────────────────────────┐ │
16//! │ │ Orchestrator │ │
17//! │ │ - Spawns vLLM processes lazily │ │
18//! │ │ - Tracks: NotStarted | Starting | Running | Sleeping │ │
19//! │ └─────────────────────────────────────────────────────┘ │
20//! │ │ │
21//! │ ┌─────────────────────────────────────────────────────┐ │
22//! │ │ Middleware Layer │ │
23//! │ │ - Extracts model from request │ │
24//! │ │ - Ensures model ready before forwarding │ │
25//! │ └─────────────────────────────────────────────────────┘ │
26//! │ │ │
27//! │ ┌─────────────────────────────────────────────────────┐ │
28//! │ │ Onwards Proxy │ │
29//! │ │ - Routes to vLLM by model name │ │
30//! │ └─────────────────────────────────────────────────────┘ │
31//! │ │ │
32//! │ ┌───────────────────┼───────────────────┐ │
33//! │ ▼ ▼ ▼ │
34//! │ [vLLM:8001] [vLLM:8002] [vLLM:8003] │
35//! │ (llama) (mistral) (qwen) │
36//! └─────────────────────────────────────────────────────────────┘
37//! ```
38
39mod config;
40pub mod control;
41mod middleware;
42pub mod object_store;
43mod orchestrator;
44mod policy;
45mod switcher;
46pub mod validate;
47
48pub use config::{CheckpointConfig, Config, ModelConfig, ObjectStoreConfig, PolicyConfig};
49pub use middleware::{ModelSwitcherLayer, ModelSwitcherService};
50pub use orchestrator::{Orchestrator, OrchestratorError, ProcessState};
51pub use policy::{
52 CostAwarePolicy, FifoPolicy, PolicyContext, PolicyDecision, ScheduleContext, SwitchPolicy,
53 TimeSlicePolicy,
54};
55pub use switcher::{
56 EvictionPolicy, ModelSwitcher, ProcessStrategy, SwitchError, SwitchMode, SwitcherState,
57 WeightStrategy,
58};
59
60use anyhow::Result;
61use std::sync::Arc;
62use tracing::info;
63
64/// Build the complete llmux stack
65///
66/// Returns:
67/// - The main Axum router (proxy + middleware)
68/// - An optional metrics router (when `config.metrics_port > 0`)
69/// - The control API router (for the admin port)
70/// - The model switcher (for driving warmup before serving)
71pub async fn build_app(
72 config: Config,
73) -> Result<(axum::Router, Option<axum::Router>, axum::Router, ModelSwitcher)> {
74 info!("Building llmux with {} models", config.models.len());
75
76 // Create orchestrator with configured command
77 let orchestrator = Arc::new(Orchestrator::with_options(
78 config.models.clone(),
79 config.vllm_command.clone(),
80 config.checkpoint.clone(),
81 ));
82
83 // Create policy
84 let model_names: Vec<String> = config.models.keys().cloned().collect();
85 let policy = config.policy.build_policy(&model_names);
86
87 // Create switcher
88 let switcher = ModelSwitcher::new(orchestrator.clone(), policy);
89
90 // Spawn background scheduler if the policy uses one
91 let _scheduler_handle = switcher.clone().spawn_scheduler();
92
93 // Build control API router (served on separate admin port)
94 let control = control::control_router(switcher.clone());
95
96 // Build onwards targets from model configs
97 let targets = config.build_onwards_targets()?;
98
99 // Create onwards app state
100 let onwards_state = onwards::AppState::new(targets);
101 let onwards_router = onwards::build_router(onwards_state);
102
103 // Wrap with model switcher middleware
104 let mut app = onwards_router.layer(ModelSwitcherLayer::new(switcher.clone()));
105
106 // Install metrics layer and build metrics router if enabled
107 let metrics_router = if config.metrics_port > 0 {
108 let (prometheus_layer, handle) = onwards::build_metrics_layer_and_handle("llmux");
109 app = app.layer(prometheus_layer);
110 Some(onwards::build_metrics_router(handle))
111 } else {
112 None
113 };
114
115 Ok((app, metrics_router, control, switcher))
116}
117
118/// Run the warmup phase: start each model, run one inference, then sleep it.
119///
120/// Iterates all models sequentially. Each model is cold-started, warmed with
121/// a single inference request (to compile CUDA graphs and warm the allocator),
122/// then put to sleep using its configured eviction policy. After warmup,
123/// every model is in its warm sleeping state so the first real request
124/// triggers a fast wake rather than a cold start.
125pub async fn run_warmup(switcher: &ModelSwitcher) -> Result<()> {
126 let orchestrator = switcher.orchestrator();
127 let models = orchestrator.registered_models();
128
129 info!(count = models.len(), "Starting warmup phase");
130
131 for model in &models {
132 let eviction = orchestrator
133 .eviction_policy_for(model)
134 .expect("model registered but no eviction policy");
135
136 let port = switcher
137 .orchestrator()
138 .model_port(model)
139 .expect("model registered but no port");
140
141 let model_path = switcher
142 .orchestrator()
143 .model_path(model)
144 .expect("model registered but no model_path");
145
146 info!(model = %model, "Warmup: starting");
147 orchestrator
148 .ensure_running(model)
149 .await
150 .map_err(|e| anyhow::anyhow!("warmup: failed to start {}: {}", model, e))?;
151
152 info!(model = %model, "Warmup: running inference");
153 validate::run_warmup_inference(port, &model_path).await?;
154
155 info!(model = %model, ?eviction, "Warmup: sleeping");
156 orchestrator
157 .sleep_model(model, eviction)
158 .await
159 .map_err(|e| anyhow::anyhow!("warmup: failed to sleep {}: {}", model, e))?;
160
161 info!(model = %model, "Warmup: complete");
162 }
163
164 info!("Warmup phase complete — all models warmed and sleeping");
165 Ok(())
166}