use crate::tide::{
BlockDenoiseConfig, BlockDenoiseLoop, BlockDenoiseStepStats, PredictiveOffloadInfo,
PredictiveOffloadParams, TideOffloadStats, enable_predictive_expert_offload,
};
use crate::{GenerateConfig, LLaDA2MoeConfig, LLaDA2Runner, LLaDA2RunnerBuilder, LLaDA2Weights};
use anyhow::Result;
/// Wrapper that owns an [`LLaDA2Runner`] and forwards its calls to it.
///
/// Convert to and from the wrapped runner with `from_llada2` / `into_llada2`.
pub struct TideRunner {
/// The wrapped runner that every method of this type delegates to.
inner: LLaDA2Runner,
}
impl TideRunner {
pub fn builder() -> LLaDA2RunnerBuilder {
LLaDA2Runner::builder()
}
pub fn from_llada2(inner: LLaDA2Runner) -> Self {
Self { inner }
}
pub fn into_llada2(self) -> LLaDA2Runner {
self.inner
}
pub fn llada2(&self) -> &LLaDA2Runner {
&self.inner
}
pub fn llada2_mut(&mut self) -> &mut LLaDA2Runner {
&mut self.inner
}
pub fn config(&self) -> &LLaDA2MoeConfig {
self.inner.config()
}
pub fn predictive_offload_info(&self) -> Option<PredictiveOffloadInfo> {
self.inner.predictive_offload_info()
}
pub fn predictive_offload_enabled(&self) -> bool {
self.inner.predictive_offload_enabled()
}
pub fn jump_steps(&self) -> usize {
self.inner.jump_steps()
}
pub fn get_offload_stats(&mut self) -> TideOffloadStats {
self.inner.get_offload_stats()
}
pub fn generate(
&mut self,
input_ids: &[u32],
gen_cfg: &GenerateConfig,
) -> Result<(Vec<u32>, Vec<BlockDenoiseStepStats>)> {
self.inner.generate(gen_cfg, input_ids)
}
pub fn block_denoise_loop(
&mut self,
cfg: BlockDenoiseConfig,
) -> BlockDenoiseLoop<crate::runner::LLaDA2RunnerForward<'_>> {
self.inner.block_denoise_loop(cfg)
}
}
impl LLaDA2RunnerBuilder {
    /// Applies the predictive expert offload settings to the builder in one call.
    ///
    /// Forwards, in this order, to `enable_predictive_expert_offload`,
    /// `reserve_vram_gb`, `jump_steps` and `moe_collect_stats`, and returns the
    /// builder produced by the last of them.
    pub fn tide_enable_predictive_expert_offload(
        self,
        max_gpu_experts_per_layer: usize,
        reserve_vram_gb: f64,
        collect_stats: bool,
        jump_steps: usize,
    ) -> Self {
        // Each setter consumes the builder and yields the next one, so the chain
        // itself is the return value.
        self.enable_predictive_expert_offload(max_gpu_experts_per_layer)
            .reserve_vram_gb(reserve_vram_gb)
            .jump_steps(jump_steps)
            .moe_collect_stats(collect_stats)
    }
}
/// Returns the [`PredictiveOffloadInfo`] that [`enable_predictive_expert_offload`]
/// reports for the given model and settings, without keeping anything else it returns.
///
/// The parameters are built from `cfg` and `weights`:
/// - the layer count comes from `crate::moe_offload::count_moe_layers(weights)`,
///   clamped to a minimum of 1;
/// - the expert count and per-expert byte size come from `cfg.num_experts` and
///   `cfg.expert_param_bytes_f32()`;
/// - `reserve_vram_gb`, `collect_stats` and `jump_steps` are copied onto the
///   parameters as given.
///
/// Returns `None` when [`enable_predictive_expert_offload`] returns `None`.
pub fn preview_predictive_offload(
    cfg: &LLaDA2MoeConfig,
    weights: &LLaDA2Weights,
    max_gpu_experts_per_layer: usize,
    reserve_vram_gb: f64,
    collect_stats: bool,
    jump_steps: usize,
) -> Option<PredictiveOffloadInfo> {
    // Never pass a zero layer count, even if no MoE layers are counted.
    let layer_count = crate::moe_offload::count_moe_layers(weights).max(1);

    let mut params = PredictiveOffloadParams::new(
        max_gpu_experts_per_layer,
        cfg.num_experts,
        layer_count,
        cfg.expert_param_bytes_f32(),
    );
    params.reserve_vram_gb = reserve_vram_gb;
    params.collect_stats = collect_stats;
    params.jump_steps = jump_steps;

    // Only the info half of the returned pair is of interest here.
    enable_predictive_expert_offload(&params).map(|(_, info)| info)
}