// rlx_llada2/tide/offload.rs
use rlx_core::device_memory_for_moe_offload;
use rlx_runtime::{Device, ExpertPoolConfig};
20
21pub fn device_memory_for_offload(device: Device) -> Option<(usize, usize)> {
23 device_memory_for_moe_offload(device)
24}
25
/// Inputs for [`enable_predictive_expert_offload`].
#[derive(Debug, Clone)]
pub struct PredictiveOffloadParams {
    /// Upper bound on the number of experts kept on the GPU per layer.
    /// The budget computation additionally clamps this to `num_experts`.
    pub max_gpu_experts_per_layer: usize,
    /// Amount of memory to leave unused, in GiB (scaled by `1024^3` when
    /// converted to bytes). Negative values are treated as zero.
    pub reserve_vram_gb: f64,
    /// Copied verbatim into `PredictiveOffloadInfo::collect_stats`.
    pub collect_stats: bool,
    /// Refresh interval passed to `ExpertRefreshPolicy::EveryDenoiseSteps`;
    /// values below 1 are raised to 1.
    pub jump_steps: usize,
    /// Explicit `(free_bytes, total_bytes)` pair. When set it takes priority
    /// over every other memory source.
    pub device_memory: Option<(usize, usize)>,
    /// Fallback memory budget in bytes, used as both the free and the total
    /// figure when `device_memory` is `None`.
    pub memory_budget_bytes: Option<usize>,
    /// Number of experts in each sparse MoE layer.
    pub num_experts: usize,
    /// Number of sparse MoE layers; one pool config is produced per layer.
    pub num_sparse_moe_layers: usize,
    /// Parameter size of a single expert, in bytes.
    pub expert_param_bytes: usize,
}

impl PredictiveOffloadParams {
    /// Builds parameters from the model-shape values, filling the tunables
    /// with defaults: `reserve_vram_gb = 1.5`, `collect_stats = false`,
    /// `jump_steps = 1`, and no explicit memory figures
    /// (`device_memory = None`, `memory_budget_bytes = None`).
    pub fn new(
        max_gpu_experts_per_layer: usize,
        num_experts: usize,
        num_sparse_moe_layers: usize,
        expert_param_bytes: usize,
    ) -> Self {
        Self {
            max_gpu_experts_per_layer,
            reserve_vram_gb: 1.5,
            collect_stats: false,
            jump_steps: 1,
            device_memory: None,
            memory_budget_bytes: None,
            num_experts,
            num_sparse_moe_layers,
            expert_param_bytes,
        }
    }
}
61
/// Summary of the offload plan produced by `enable_predictive_expert_offload`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PredictiveOffloadInfo {
    /// Whether offload is active; `enable_predictive_expert_offload` sets
    /// this to `true` on every value it returns.
    pub enabled: bool,
    /// Number of experts per layer that the plan keeps on the GPU.
    pub gpu_expert_budget_per_layer: usize,
    /// Number of sparse MoE layers the plan covers.
    pub num_sparse_moe_layers: usize,
    /// Parameter size of a single expert, in bytes.
    pub expert_param_bytes: usize,
    /// Free-memory figure (bytes) the budget was computed from.
    /// `enable_predictive_expert_offload` always fills this in, even when the
    /// figure came from a fallback rather than an explicit device pair.
    pub cuda_free_bytes: Option<usize>,
    /// Total-memory figure (bytes) the budget was computed from; populated
    /// under the same conditions as `cuda_free_bytes`.
    pub cuda_total_bytes: Option<usize>,
    /// Bytes withheld from the free figure before experts were budgeted.
    pub reserve_bytes: usize,
    /// Effective refresh interval (never below 1).
    pub jump_steps: usize,
    /// Copied from `PredictiveOffloadParams::collect_stats`.
    pub collect_stats: bool,
}
75
/// Computes how many experts per layer fit on the device after a safety reserve.
///
/// The reserve is the larger of `reserve_vram_gb` GiB (negative values count
/// as zero) and 10% of `total_bytes`. Whatever remains of `free_bytes` is
/// divided by the cost of keeping one expert resident in every layer
/// (`expert_param_bytes * num_moe_layers`), and the quotient is capped at
/// `min(max_gpu_experts_per_layer, num_experts)`.
///
/// If `expert_param_bytes` or `num_moe_layers` is zero the per-expert cost is
/// unknown, so the cap itself is returned as the budget.
///
/// # Returns
///
/// `(gpu_expert_budget_per_layer, reserve_bytes)`.
pub fn gpu_expert_budget_from_device_memory(
    free_bytes: usize,
    total_bytes: usize,
    expert_param_bytes: usize,
    num_moe_layers: usize,
    num_experts: usize,
    max_gpu_experts_per_layer: usize,
    reserve_vram_gb: f64,
) -> (usize, usize) {
    const BYTES_PER_GIB: f64 = 1_073_741_824.0; // 1024^3
    const RESERVE_FRACTION: f64 = 0.1;

    // Float-to-integer `as` casts saturate (and map NaN to 0), so oversized
    // or degenerate reserves cannot wrap around.
    let reserve_from_gib = (reserve_vram_gb * BYTES_PER_GIB).max(0.0) as usize;
    let reserve_from_fraction = (RESERVE_FRACTION * total_bytes as f64) as usize;
    let reserve_bytes = reserve_from_gib.max(reserve_from_fraction);

    let usable_bytes = free_bytes.saturating_sub(reserve_bytes);
    let max_budget = max_gpu_experts_per_layer.min(num_experts);

    // Cost of one resident expert slot across all layers. The saturating
    // product is zero exactly when one of its factors is zero.
    let bytes_per_slot = expert_param_bytes.saturating_mul(num_moe_layers);
    let budget = match bytes_per_slot {
        0 => max_budget,
        cost => (usable_bytes / cost).min(max_budget),
    };

    (budget, reserve_bytes)
}
98
99pub fn enable_predictive_expert_offload(
101 params: &PredictiveOffloadParams,
102) -> Option<(Vec<ExpertPoolConfig>, PredictiveOffloadInfo)> {
103 let num_experts = params.num_experts;
104 if params.num_sparse_moe_layers == 0 || num_experts == 0 {
105 return None;
106 }
107
108 let (free_bytes, total_bytes) = if let Some(pair) = params.device_memory {
109 pair
110 } else if let Some(b) = params.memory_budget_bytes {
111 (b, b)
112 } else if let Some(total) = rlx_runtime::memory_estimate::available_unified_memory() {
113 (total, total)
114 } else {
115 (usize::MAX / 2, usize::MAX)
116 };
117
118 let (gpu_budget, reserve_bytes) = gpu_expert_budget_from_device_memory(
119 free_bytes,
120 total_bytes,
121 params.expert_param_bytes,
122 params.num_sparse_moe_layers,
123 num_experts,
124 params.max_gpu_experts_per_layer,
125 params.reserve_vram_gb,
126 );
127
128 if gpu_budget >= num_experts {
129 return None;
130 }
131
132 let refresh = rlx_runtime::ExpertRefreshPolicy::EveryDenoiseSteps(params.jump_steps.max(1));
133 let pools: Vec<_> = (0..params.num_sparse_moe_layers)
134 .map(|_| ExpertPoolConfig::new(num_experts, gpu_budget, refresh))
135 .collect();
136
137 let info = PredictiveOffloadInfo {
138 enabled: true,
139 gpu_expert_budget_per_layer: gpu_budget,
140 num_sparse_moe_layers: params.num_sparse_moe_layers,
141 expert_param_bytes: params.expert_param_bytes,
142 cuda_free_bytes: params.device_memory.map(|(f, _)| f).or(Some(free_bytes)),
143 cuda_total_bytes: params.device_memory.map(|(_, t)| t).or(Some(total_bytes)),
144 reserve_bytes,
145 jump_steps: params.jump_steps.max(1),
146 collect_stats: params.collect_stats,
147 };
148
149 Some((pools, info))
150}