// llmux — crate root (lib.rs)

1//! # llmux
2//!
3//! Zero-reload model switching for vLLM - manages multiple models on shared GPU.
4//!
5//! This crate provides:
6//! - **Orchestrator**: Lazily starts vLLM processes on first request
7//! - **Switcher**: Coordinates wake/sleep between models
8//! - **Middleware**: Axum layer that integrates with onwards proxy
9//!
10//! ## Architecture
11//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────┐
//! │                            llmux                            │
//! │  ┌──────────────────────────────────────────────────────┐   │
//! │  │ Orchestrator                                         │   │
//! │  │ - Spawns vLLM processes lazily                       │   │
//! │  │ - Tracks: NotStarted | Starting | Running | Sleeping │   │
//! │  └──────────────────────────────────────────────────────┘   │
//! │                             │                               │
//! │  ┌──────────────────────────────────────────────────────┐   │
//! │  │ Middleware Layer                                     │   │
//! │  │ - Extracts model from request                        │   │
//! │  │ - Ensures model ready before forwarding              │   │
//! │  └──────────────────────────────────────────────────────┘   │
//! │                             │                               │
//! │  ┌──────────────────────────────────────────────────────┐   │
//! │  │ Onwards Proxy                                        │   │
//! │  │ - Routes to vLLM by model name                       │   │
//! │  └──────────────────────────────────────────────────────┘   │
//! │                             │                               │
//! │      ┌──────────────────────┼──────────────────────┐        │
//! │      ▼                      ▼                      ▼        │
//! │ [vLLM:8001]            [vLLM:8002]            [vLLM:8003]   │
//! │   (llama)               (mistral)               (qwen)      │
//! └─────────────────────────────────────────────────────────────┘
//! ```
38
39mod config;
40pub mod control;
41mod middleware;
42pub mod object_store;
43mod orchestrator;
44mod policy;
45mod switcher;
46pub mod validate;
47
48pub use config::{CheckpointConfig, Config, ModelConfig, ObjectStoreConfig, PolicyConfig};
49pub use middleware::{ModelSwitcherLayer, ModelSwitcherService};
50pub use orchestrator::{Orchestrator, OrchestratorError, ProcessState};
51pub use policy::{
52    CostAwarePolicy, FifoPolicy, PolicyContext, PolicyDecision, ScheduleContext, SwitchPolicy,
53    TimeSlicePolicy,
54};
55pub use switcher::{
56    EvictionPolicy, ModelSwitcher, ProcessStrategy, SwitchError, SwitchMode, SwitcherState,
57    WeightStrategy,
58};
59
60use anyhow::Result;
61use std::sync::Arc;
62use tracing::info;
63
64/// Build the complete llmux stack
65///
66/// Returns:
67/// - The main Axum router (proxy + middleware)
68/// - An optional metrics router (when `config.metrics_port > 0`)
69/// - The control API router (for the admin port)
70/// - The model switcher (for driving warmup before serving)
71pub async fn build_app(
72    config: Config,
73) -> Result<(axum::Router, Option<axum::Router>, axum::Router, ModelSwitcher)> {
74    info!("Building llmux with {} models", config.models.len());
75
76    // Create orchestrator with configured command
77    let orchestrator = Arc::new(Orchestrator::with_options(
78        config.models.clone(),
79        config.vllm_command.clone(),
80        config.checkpoint.clone(),
81    ));
82
83    // Create policy
84    let model_names: Vec<String> = config.models.keys().cloned().collect();
85    let policy = config.policy.build_policy(&model_names);
86
87    // Create switcher
88    let switcher = ModelSwitcher::new(orchestrator.clone(), policy);
89
90    // Spawn background scheduler if the policy uses one
91    let _scheduler_handle = switcher.clone().spawn_scheduler();
92
93    // Build control API router (served on separate admin port)
94    let control = control::control_router(switcher.clone());
95
96    // Build onwards targets from model configs
97    let targets = config.build_onwards_targets()?;
98
99    // Create onwards app state
100    let onwards_state = onwards::AppState::new(targets);
101    let onwards_router = onwards::build_router(onwards_state);
102
103    // Wrap with model switcher middleware
104    let mut app = onwards_router.layer(ModelSwitcherLayer::new(switcher.clone()));
105
106    // Install metrics layer and build metrics router if enabled
107    let metrics_router = if config.metrics_port > 0 {
108        let (prometheus_layer, handle) = onwards::build_metrics_layer_and_handle("llmux");
109        app = app.layer(prometheus_layer);
110        Some(onwards::build_metrics_router(handle))
111    } else {
112        None
113    };
114
115    Ok((app, metrics_router, control, switcher))
116}
117
118/// Run the warmup phase: start each model, run one inference, then sleep it.
119///
120/// Iterates all models sequentially. Each model is cold-started, warmed with
121/// a single inference request (to compile CUDA graphs and warm the allocator),
122/// then put to sleep using its configured eviction policy. After warmup,
123/// every model is in its warm sleeping state so the first real request
124/// triggers a fast wake rather than a cold start.
125pub async fn run_warmup(switcher: &ModelSwitcher) -> Result<()> {
126    let orchestrator = switcher.orchestrator();
127    let models = orchestrator.registered_models();
128
129    info!(count = models.len(), "Starting warmup phase");
130
131    for model in &models {
132        let eviction = orchestrator
133            .eviction_policy_for(model)
134            .expect("model registered but no eviction policy");
135
136        let port = switcher
137            .orchestrator()
138            .model_port(model)
139            .expect("model registered but no port");
140
141        let model_path = switcher
142            .orchestrator()
143            .model_path(model)
144            .expect("model registered but no model_path");
145
146        info!(model = %model, "Warmup: starting");
147        orchestrator
148            .ensure_running(model)
149            .await
150            .map_err(|e| anyhow::anyhow!("warmup: failed to start {}: {}", model, e))?;
151
152        info!(model = %model, "Warmup: running inference");
153        validate::run_warmup_inference(port, &model_path).await?;
154
155        info!(model = %model, ?eviction, "Warmup: sleeping");
156        orchestrator
157            .sleep_model(model, eviction)
158            .await
159            .map_err(|e| anyhow::anyhow!("warmup: failed to sleep {}: {}", model, e))?;
160
161        info!(model = %model, "Warmup: complete");
162    }
163
164    info!("Warmup phase complete — all models warmed and sleeping");
165    Ok(())
166}