use crate::error::{RealizarError, Result};
use crate::gguf::qwen3_moe_load::Qwen3MoeQuantizedLayer;
use crate::gguf::OwnedQuantizedModel;
/// wgpu-backed wrapper around an [`OwnedQuantizedModel`].
///
/// M-GPU-MOE-2.0 state: this type only holds the CPU-side model; the
/// `forward_qwen3_moe_wgpu` entry point validates its inputs and then returns
/// `UnsupportedOperation` until the wgpu kernel dispatch lands (M-GPU-MOE-2.2).
pub struct OwnedQuantizedModelWgpu {
    // CPU-resident quantized weights. NOTE(review): `pub(crate)` presumably so
    // sibling modules in the crate can read layer data directly — confirm
    // against other forward implementations before narrowing visibility.
    pub(crate) model: OwnedQuantizedModel,
}
impl OwnedQuantizedModelWgpu {
pub fn new(model: OwnedQuantizedModel) -> Result<Self> {
Ok(Self { model })
}
#[allow(clippy::too_many_arguments)]
pub fn forward_qwen3_moe_wgpu(
&self,
token_ids: &[u32],
moe_layers: &[Qwen3MoeQuantizedLayer],
num_experts: usize,
num_experts_per_tok: usize,
moe_intermediate: usize,
_data: &[u8],
) -> Result<Vec<f32>> {
if token_ids.is_empty() {
return Err(RealizarError::InvalidShape {
reason: "forward_qwen3_moe_wgpu: token_ids must not be empty".to_string(),
});
}
if moe_layers.len() != self.model.layers.len() {
return Err(RealizarError::InvalidShape {
reason: format!(
"forward_qwen3_moe_wgpu: moe_layers.len() = {} but model has {} decoder layers",
moe_layers.len(),
self.model.layers.len()
),
});
}
if num_experts == 0 || num_experts_per_tok == 0 || moe_intermediate == 0 {
return Err(RealizarError::InvalidShape {
reason: format!(
"forward_qwen3_moe_wgpu: incomplete MoE config — num_experts={num_experts}, \
num_experts_per_tok={num_experts_per_tok}, moe_intermediate={moe_intermediate}. \
Caller must supply all three from GGUF metadata."
),
});
}
if num_experts_per_tok > num_experts {
return Err(RealizarError::InvalidShape {
reason: format!(
"forward_qwen3_moe_wgpu: num_experts_per_tok ({num_experts_per_tok}) \
exceeds num_experts ({num_experts})"
),
});
}
Err(RealizarError::UnsupportedOperation {
operation: "forward_qwen3_moe_wgpu".to_string(),
reason: "M-GPU-MOE-2.0 stub on OwnedQuantizedModelWgpu. \
Per qwen3-moe-forward-gpu-v1 v1.2.0 option I, the implementation lands \
incrementally: M-GPU-MOE-2.1 (per-expert wgpu dispatch helpers) → \
M-GPU-MOE-2.2 (full forward integration analog of \
forward_qwen3_moe_cuda) → M-GPU-MOE-2.3 (cosine-vs-CPU parity test). \
Until 2.2 lands, callers on non-CUDA hardware should fall back to \
OwnedQuantizedModel::forward_qwen3_moe (CPU LAZY-FUSED-MATVEC, \
~30 tok/s on Qwen3-Coder-30B-A3B-Instruct-Q4_K_M)."
.to_string(),
})
}
}
#[cfg(test)]
mod owned_quantized_model_wgpu_tests {
    use super::*;

    /// Compile-time check that the stub's signature matches the contract the
    /// M-GPU-MOE-2.x milestones build against.
    ///
    /// Renamed from `..._precondition_empty_tokens_returns_invalid_shape`: the
    /// old name claimed a runtime behavior (empty `token_ids` →
    /// `InvalidShape`) that the body never exercised — constructing an
    /// `OwnedQuantizedModel` is not feasible in an isolated unit test, so all
    /// this can assert is the method's type. Coercing the method to an
    /// explicit `fn` pointer fails to compile if the signature drifts.
    #[test]
    fn forward_qwen3_moe_wgpu_signature_matches_contract() {
        let _: fn(
            &OwnedQuantizedModelWgpu,
            &[u32],
            &[Qwen3MoeQuantizedLayer],
            usize,
            usize,
            usize,
            &[u8],
        ) -> Result<Vec<f32>> = OwnedQuantizedModelWgpu::forward_qwen3_moe_wgpu;
    }
}