1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
//! # llmux
//!
//! Zero-reload model switching for vLLM - manages multiple models on a shared GPU.
//!
//! This crate provides:
//! - **Orchestrator**: Lazily starts vLLM processes on first request
//! - **Switcher**: Coordinates wake/sleep between models
//! - **Middleware**: Axum layer that integrates with the onwards proxy
//!
//! ## Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────┐
//! │                            llmux                            │
//! │  ┌───────────────────────────────────────────────────────┐  │
//! │  │                     Orchestrator                      │  │
//! │  │  - Spawns vLLM processes lazily                       │  │
//! │  │  - Tracks: NotStarted | Starting | Running | Sleeping │  │
//! │  └───────────────────────────────────────────────────────┘  │
//! │                              │                              │
//! │  ┌───────────────────────────────────────────────────────┐  │
//! │  │                   Middleware Layer                    │  │
//! │  │  - Extracts model from request                        │  │
//! │  │  - Ensures model ready before forwarding              │  │
//! │  └───────────────────────────────────────────────────────┘  │
//! │                              │                              │
//! │  ┌───────────────────────────────────────────────────────┐  │
//! │  │                     Onwards Proxy                     │  │
//! │  │  - Routes to vLLM by model name                       │  │
//! │  └───────────────────────────────────────────────────────┘  │
//! │                              │                              │
//! │          ┌───────────────────┼───────────────────┐          │
//! │          ▼                   ▼                   ▼          │
//! │     [vLLM:8001]         [vLLM:8002]         [vLLM:8003]     │
//! │       (llama)            (mistral)            (qwen)        │
//! └─────────────────────────────────────────────────────────────┘
//! ```
pub use ;
pub use ;
pub use ;
pub use ;
pub use ;
use Result;
use Arc;
use info;
/// Build the complete llmux stack
///
/// Returns an Axum router with:
/// - Model switching middleware
/// - Onwards proxy configured for all models
pub async