Skip to main content

llmux/
lib.rs

1//! # llmux
2//!
3//! Zero-reload model switching for vLLM - manages multiple models on shared GPU.
4//!
5//! This crate provides:
6//! - **Orchestrator**: Lazily starts vLLM processes on first request
7//! - **Switcher**: Coordinates wake/sleep between models
8//! - **Middleware**: Axum layer that integrates with onwards proxy
9//!
10//! ## Architecture
11//!
12//! ```text
13//! ┌─────────────────────────────────────────────────────────────┐
14//! │                     llmux                          │
15//! │  ┌─────────────────────────────────────────────────────┐   │
16//! │  │ Orchestrator                                         │   │
17//! │  │ - Spawns vLLM processes lazily                       │   │
18//! │  │ - Tracks: NotStarted | Starting | Running | Sleeping │   │
19//! │  └─────────────────────────────────────────────────────┘   │
20//! │                          │                                  │
21//! │  ┌─────────────────────────────────────────────────────┐   │
22//! │  │ Middleware Layer                                     │   │
23//! │  │ - Extracts model from request                        │   │
24//! │  │ - Ensures model ready before forwarding              │   │
25//! │  └─────────────────────────────────────────────────────┘   │
26//! │                          │                                  │
27//! │  ┌─────────────────────────────────────────────────────┐   │
28//! │  │ Onwards Proxy                                        │   │
29//! │  │ - Routes to vLLM by model name                       │   │
30//! │  └─────────────────────────────────────────────────────┘   │
31//! │                          │                                  │
32//! │      ┌───────────────────┼───────────────────┐             │
33//! │      ▼                   ▼                   ▼             │
34//! │  [vLLM:8001]        [vLLM:8002]         [vLLM:8003]        │
35//! │   (llama)           (mistral)           (qwen)            │
36//! └─────────────────────────────────────────────────────────────┘
37//! ```
38
39mod config;
40pub mod control;
41mod middleware;
42pub mod object_store;
43mod orchestrator;
44mod policy;
45mod switcher;
46pub(crate) mod types;
47pub mod validate;
48
49pub use config::{CheckpointConfig, Config, ModelConfig, ObjectStoreConfig, PolicyConfig};
50pub use middleware::{ModelSwitcherLayer, ModelSwitcherService};
51pub use orchestrator::{Orchestrator, OrchestratorError, ProcessState};
52pub use policy::{
53    CostAwarePolicy, FifoPolicy, PolicyContext, PolicyDecision, ScheduleContext, SwitchPolicy,
54    TimeSlicePolicy,
55};
56pub use switcher::{InFlightGuard, ModelSwitcher};
57pub use types::{
58    EvictionPolicy, ProcessStrategy, SwitchError, SwitchMode, SwitcherState, WeightStrategy,
59};
60
61use anyhow::Result;
62use std::sync::Arc;
63use tracing::info;
64
65/// Build the complete llmux stack
66///
67/// Returns:
68/// - The main Axum router (proxy + middleware)
69/// - An optional metrics router (when `config.metrics_port > 0`)
70/// - The control API router (for the admin port)
71/// - The model switcher (for driving warmup before serving)
72pub async fn build_app(
73    config: Config,
74) -> Result<(
75    axum::Router,
76    Option<axum::Router>,
77    axum::Router,
78    ModelSwitcher,
79)> {
80    info!("Building llmux with {} models", config.models.len());
81
82    // Create orchestrator with configured command
83    let orchestrator = Arc::new(Orchestrator::with_startup_timeout(
84        config.models.clone(),
85        config.vllm_command.clone(),
86        config.checkpoint.clone(),
87        std::time::Duration::from_secs(config.startup_timeout_secs),
88    ));
89
90    // Create policy
91    let model_names: Vec<String> = config.models.keys().cloned().collect();
92    let policy = config.policy.build_policy(&model_names);
93
94    // Create switcher
95    let switcher = ModelSwitcher::new(orchestrator.clone(), policy);
96
97    // Spawn background scheduler if the policy uses one
98    let _scheduler_handle = switcher.clone().spawn_scheduler();
99
100    // Build control API router (served on separate admin port)
101    let control = control::control_router(switcher.clone());
102
103    // Build onwards targets from model configs
104    let targets = config.build_onwards_targets()?;
105
106    // Create onwards app state
107    let onwards_state = onwards::AppState::new(targets);
108    let onwards_router = onwards::build_router(onwards_state);
109
110    // Wrap with model switcher middleware
111    let mut app = onwards_router.layer(ModelSwitcherLayer::new(switcher.clone()));
112
113    // Install metrics layer and build metrics router if enabled
114    let metrics_router = if config.metrics_port > 0 {
115        let (prometheus_layer, handle) = onwards::build_metrics_layer_and_handle("llmux");
116        app = app.layer(prometheus_layer);
117        Some(onwards::build_metrics_router(handle))
118    } else {
119        None
120    };
121
122    Ok((app, metrics_router, control, switcher))
123}
124
125/// Run the warmup phase: start each model, run one inference, then sleep it.
126///
127/// Iterates all models sequentially. Each model is cold-started, warmed with
128/// a single inference request (to compile CUDA graphs and warm the allocator),
129/// then put to sleep using its configured eviction policy. After warmup,
130/// every model is in its warm sleeping state so the first real request
131/// triggers a fast wake rather than a cold start.
132pub async fn run_warmup(switcher: &ModelSwitcher) -> Result<()> {
133    let orchestrator = switcher.orchestrator();
134    let models = orchestrator.registered_models();
135
136    info!(count = models.len(), "Starting warmup phase");
137
138    for model in &models {
139        let eviction = orchestrator
140            .eviction_policy_for(model)
141            .expect("model registered but no eviction policy");
142
143        let port = switcher
144            .orchestrator()
145            .model_port(model)
146            .expect("model registered but no port");
147
148        let model_path = switcher
149            .orchestrator()
150            .model_path(model)
151            .expect("model registered but no model_path");
152
153        info!(model = %model, "Warmup: starting");
154        orchestrator
155            .ensure_running(model)
156            .await
157            .map_err(|e| anyhow::anyhow!("warmup: failed to start {}: {}", model, e))?;
158
159        info!(model = %model, "Warmup: running inference");
160        validate::run_warmup_inference(port, &model_path).await?;
161
162        info!(model = %model, ?eviction, "Warmup: sleeping");
163        orchestrator
164            .sleep_model(model, eviction)
165            .await
166            .map_err(|e| anyhow::anyhow!("warmup: failed to sleep {}: {}", model, e))?;
167
168        info!(model = %model, "Warmup: complete");
169    }
170
171    info!("Warmup phase complete — all models warmed and sleeping");
172    Ok(())
173}