#![cfg(target_os = "macos")]
use std::path::{Path, PathBuf};
mod common;
use common::c_backend::Ctx;
use moeflux::riir::RsCtx;
/// Common surface over the two moeflux implementations under differential test.
///
/// Implemented in this file by `CBackend` (wrapping `common::c_backend::Ctx`)
/// and `RsBackend` (wrapping `moeflux::riir::RsCtx`). Methods return owned
/// buffers so a test can feed identical inputs to both backends and compare
/// the outputs directly. Both implementations in this file panic (via
/// `expect`) when the wrapped call reports an error.
pub trait DiffBackend {
/// Opens a backend from the weight blob, its manifest, the vocab file and the
/// experts directory, with the given experts-per-token count and 2-bit flag.
fn open(
weights: &Path,
manifest: &Path,
vocab: &Path,
experts_dir: &Path,
experts_per_tok: u32,
use_2bit: bool,
) -> Self;
// --- Model metadata accessors -------------------------------------------
fn n_vocab(&self) -> usize;
fn n_ctx(&self) -> usize;
fn eos(&self) -> i32;
fn model_name(&self) -> &'static str;
// --- CPU kernels (take `&self`) -----------------------------------------
/// Returns the embedding row for `token_id` as a fresh vector.
fn embed(&self, token_id: i32) -> Vec<f32>;
/// RMS-normalizes `x` using the tensor called `weight_name`.
fn rms_norm_cpu(&self, weight_name: &str, x: &[f32]) -> Vec<f32>;
/// Applies rotary embedding at `pos`; returns the rotated `(q, k)` copies
/// and leaves the inputs untouched.
fn apply_rotary_emb(
&self,
pos: i32,
q: &[f32],
k: &[f32],
) -> (Vec<f32>, Vec<f32>);
/// Per-head RMS norm of `x` using the tensor called `weight_name`.
fn rms_norm_per_head_cpu(
&self,
weight_name: &str,
num_heads: usize,
head_dim: usize,
x: &[f32],
) -> Vec<f32>;
/// Scaled-dot-product attention over `kv_len` cached positions.
fn sdpa_cpu(
&self,
kv_len: i32,
q: &[f32],
q_gate: &[f32],
k_cache: &[f32],
v_cache: &[f32],
) -> Vec<f32>;
/// Projects `x` through the LM head to produce logits.
fn lm_head_cpu(&self, x: &[f32]) -> Vec<f32>;
/// Routes `scores` to `k` experts; returns `(expert indices, weights)`.
/// `scores` is taken by value because the implementations hand the wrapped
/// context a mutable view of it.
fn moe_router_cpu(&self, scores: Vec<f32>, k: usize) -> (Vec<i32>, Vec<f32>);
/// One step of the 1-D convolution identified by `weight_name`.
fn conv1d_step_cpu(
&self,
weight_name: &str,
channels: usize,
kernel_size: usize,
conv_state: &[f32],
new_input: &[f32],
) -> Vec<f32>;
/// RMS norm without a weight tensor, using the supplied `eps`.
fn rms_norm_bare_cpu(&self, eps: f32, x: &[f32]) -> Vec<f32>;
/// Gated RMS norm of `x` with gate input `z`.
fn rms_norm_gated_cpu(
&self,
weight_name: &str,
eps: f32,
x: &[f32],
z: &[f32],
) -> Vec<f32>;
/// Gated delta recurrence step. Consumes the incoming state and returns
/// `(updated state, output)`.
#[allow(clippy::too_many_arguments)]
fn gated_delta_recurrence_cpu(
&self,
layer_idx: usize,
alpha: &[f32],
beta: &[f32],
q: &[f32],
k: &[f32],
v: &[f32],
v_heads: usize,
k_heads: usize,
key_dim: usize,
value_dim: usize,
ssm_state_in: Vec<f32>,
) -> (Vec<f32>, Vec<f32>);
/// Reads the raw bytes of one expert from one layer.
fn load_expert_bytes(&self, layer_idx: i32, expert_idx: i32) -> Vec<u8>;
// --- GPU kernels (take `&mut self`) -------------------------------------
fn gpu_rms_norm_fused(
&mut self,
x: &[f32],
weight_bf16: &[u8],
) -> Vec<f32>;
fn gpu_expert_forward(
&mut self,
expert_data: &[u8],
h_post: &[f32],
) -> Vec<f32>;
#[allow(clippy::too_many_arguments)]
fn gpu_batched_experts_forward(
&mut self,
actual_k: i32,
expert_data: &[u8],
h_post: &[f32],
h_mid: &[f32],
shared_out: &[f32],
expert_weights: &[f32],
shared_gate_score: f32,
) -> Vec<f32>;
#[allow(clippy::too_many_arguments)]
fn attn_scores_batched(
&mut self,
num_heads: u32,
num_kv_heads: u32,
head_dim: u32,
seq_len: u32,
q: &[f32],
k_cache: &[f32],
scale: f32,
) -> Vec<f32>;
/// Softmax over `scores_in`; the input is copied, not modified.
fn attn_softmax_batched(
&mut self,
num_heads: u32,
seq_len: u32,
scores_in: &[f32],
) -> Vec<f32>;
#[allow(clippy::too_many_arguments)]
fn attn_values_batched(
&mut self,
num_heads: u32,
num_kv_heads: u32,
head_dim: u32,
seq_len: u32,
scores: &[f32],
v_cache: &[f32],
) -> Vec<f32>;
/// Applies a sigmoid gate to a copy of `x_in` and returns that copy.
fn sigmoid_gate(
&mut self,
dim: u32,
gate: &[f32],
x_in: &[f32],
) -> Vec<f32>;
// --- Deferred expert dispatch: begin, then either complete or discard ---
#[allow(clippy::too_many_arguments)]
fn begin_deferred_experts(
&mut self,
actual_k: i32,
expert_data: &[u8],
h_post: &[f32],
h_mid: &[f32],
shared_out: &[f32],
expert_weights: &[f32],
shared_gate_score: f32,
);
fn complete_deferred_experts(&mut self) -> Vec<f32>;
fn discard_deferred_experts(&mut self);
// --- Whole-layer / whole-model evaluation -------------------------------
/// Runs layer `layer_idx` at position `pos` on `hidden_in` and returns the
/// resulting hidden vector.
fn layer_forward_dump(
&mut self,
layer_idx: i32,
pos: i32,
hidden_in: &[f32],
) -> Vec<f32>;
/// Evaluates `tokens` starting at `start_pos`; returns logits.
fn eval_prompt(&mut self, tokens: &[i32], start_pos: usize) -> Vec<f32>;
/// Evaluates a single `token` at `pos`; returns logits.
fn eval_token(&mut self, token: i32, pos: usize) -> Vec<f32>;
// --- Memory management passthroughs -------------------------------------
fn memory_clear(&mut self);
fn memory_seq_rm(&mut self, p0: i32, p1: i32) -> bool;
fn memory_seq_pos_max(&self) -> i32;
}
/// `DiffBackend` adapter over the C implementation (`common::c_backend::Ctx`).
///
/// The inner context is public so tests can pass it to helpers that need the
/// raw `Ctx`.
pub struct CBackend(pub Ctx);
// Every method allocates (or copies) the output buffer the wrapped call
// writes into, forwards the arguments, and panics via `expect` with a
// "CBackend <method>" message if the wrapped call fails.
impl DiffBackend for CBackend {
fn open(
weights: &Path,
manifest: &Path,
vocab: &Path,
experts_dir: &Path,
experts_per_tok: u32,
use_2bit: bool,
) -> Self {
Self(
Ctx::open(
weights,
manifest,
vocab,
experts_dir,
experts_per_tok,
use_2bit,
)
.expect("CBackend Ctx::open"),
)
}
fn n_vocab(&self) -> usize {
self.0.n_vocab()
}
fn n_ctx(&self) -> usize {
self.0.n_ctx()
}
fn eos(&self) -> i32 {
self.0.eos()
}
fn model_name(&self) -> &'static str {
self.0.model_name()
}
fn embed(&self, token_id: i32) -> Vec<f32> {
// Output is sized from the compile-time variant, not queried from the context.
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0.embed(token_id, &mut out).expect("CBackend embed");
out
}
fn rms_norm_cpu(&self, weight_name: &str, x: &[f32]) -> Vec<f32> {
// Output length is always `hidden_dim`, independent of `x.len()`.
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.rms_norm_cpu(weight_name, x, &mut out)
.expect("CBackend rms_norm_cpu");
out
}
fn apply_rotary_emb(
&self,
pos: i32,
q: &[f32],
k: &[f32],
) -> (Vec<f32>, Vec<f32>) {
// The wrapped call rotates in place, so work on copies of the inputs.
let mut q_out = q.to_vec();
let mut k_out = k.to_vec();
self.0
.apply_rotary_emb(pos, &mut q_out, &mut k_out)
.expect("CBackend apply_rotary_emb");
(q_out, k_out)
}
fn rms_norm_per_head_cpu(
&self,
weight_name: &str,
num_heads: usize,
head_dim: usize,
x: &[f32],
) -> Vec<f32> {
// In-place on a copy of `x`.
let mut out = x.to_vec();
self.0
.rms_norm_per_head_cpu(weight_name, num_heads, head_dim, &mut out)
.expect("CBackend rms_norm_per_head_cpu");
out
}
fn sdpa_cpu(
&self,
kv_len: i32,
q: &[f32],
q_gate: &[f32],
k_cache: &[f32],
v_cache: &[f32],
) -> Vec<f32> {
// Output has the same length as `q`.
let mut out = vec![0.0f32; q.len()];
self.0
.sdpa_cpu(kv_len, q, q_gate, k_cache, v_cache, &mut out)
.expect("CBackend sdpa_cpu");
out
}
fn lm_head_cpu(&self, x: &[f32]) -> Vec<f32> {
// One logit per vocabulary entry, as reported by the context.
let mut out = vec![0.0f32; self.0.n_vocab()];
self.0
.lm_head_cpu(x, &mut out)
.expect("CBackend lm_head_cpu");
out
}
fn moe_router_cpu(&self, scores: Vec<f32>, k: usize) -> (Vec<i32>, Vec<f32>) {
// The wrapped call receives the scores mutably; the owned copy is
// dropped afterwards and only indices and weights are returned.
let mut s = scores;
let mut idx = vec![0i32; k];
let mut w = vec![0.0f32; k];
self.0
.moe_router_cpu(&mut s, k, &mut idx, &mut w)
.expect("CBackend moe_router_cpu");
(idx, w)
}
fn conv1d_step_cpu(
&self,
weight_name: &str,
channels: usize,
kernel_size: usize,
conv_state: &[f32],
new_input: &[f32],
) -> Vec<f32> {
// One output value per channel.
let mut out = vec![0.0f32; channels];
self.0
.conv1d_step_cpu(
weight_name,
channels,
kernel_size,
conv_state,
new_input,
&mut out,
)
.expect("CBackend conv1d_step_cpu");
out
}
fn rms_norm_bare_cpu(&self, eps: f32, x: &[f32]) -> Vec<f32> {
let mut out = vec![0.0f32; x.len()];
self.0
.rms_norm_bare_cpu(eps, x, &mut out)
.expect("CBackend rms_norm_bare_cpu");
out
}
fn rms_norm_gated_cpu(
&self,
weight_name: &str,
eps: f32,
x: &[f32],
z: &[f32],
) -> Vec<f32> {
let mut out = vec![0.0f32; x.len()];
self.0
.rms_norm_gated_cpu(weight_name, eps, x, z, &mut out)
.expect("CBackend rms_norm_gated_cpu");
out
}
fn gated_delta_recurrence_cpu(
&self,
layer_idx: usize,
alpha: &[f32],
beta: &[f32],
q: &[f32],
k: &[f32],
v: &[f32],
v_heads: usize,
k_heads: usize,
key_dim: usize,
value_dim: usize,
ssm_state_in: Vec<f32>,
) -> (Vec<f32>, Vec<f32>) {
// The state is passed mutably and returned alongside the output.
let mut state = ssm_state_in;
let mut out = vec![0.0f32; v_heads * value_dim];
self.0
.gated_delta_recurrence_cpu(
layer_idx,
alpha,
beta,
q,
k,
v,
v_heads,
k_heads,
key_dim,
value_dim,
&mut state,
&mut out,
)
.expect("CBackend gated_delta_recurrence_cpu");
(state, out)
}
fn load_expert_bytes(&self, layer_idx: i32, expert_idx: i32) -> Vec<u8> {
// NOTE(review): the buffer is always the 4-bit expert size, even though
// `open` accepts `use_2bit` — confirm this is intended for 2-bit runs.
let mut out = vec![0u8; moeflux::riir::VARIANT.expert_size_4bit()];
self.0
.load_expert_bytes(layer_idx, expert_idx, &mut out)
.expect("CBackend load_expert_bytes");
out
}
fn gpu_rms_norm_fused(
&mut self,
x: &[f32],
weight_bf16: &[u8],
) -> Vec<f32> {
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.gpu_rms_norm_fused(x, weight_bf16, &mut out)
.expect("CBackend gpu_rms_norm_fused");
out
}
fn gpu_expert_forward(
&mut self,
expert_data: &[u8],
h_post: &[f32],
) -> Vec<f32> {
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.gpu_expert_forward(expert_data, h_post, &mut out)
.expect("CBackend gpu_expert_forward");
out
}
fn gpu_batched_experts_forward(
&mut self,
actual_k: i32,
expert_data: &[u8],
h_post: &[f32],
h_mid: &[f32],
shared_out: &[f32],
expert_weights: &[f32],
shared_gate_score: f32,
) -> Vec<f32> {
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.gpu_batched_experts_forward(
actual_k,
expert_data,
h_post,
h_mid,
shared_out,
expert_weights,
shared_gate_score,
&mut out,
)
.expect("CBackend gpu_batched_experts_forward");
out
}
fn attn_scores_batched(
&mut self,
num_heads: u32,
num_kv_heads: u32,
head_dim: u32,
seq_len: u32,
q: &[f32],
k_cache: &[f32],
scale: f32,
) -> Vec<f32> {
// Output buffer holds `num_heads * seq_len` floats; the product is
// computed in `u32` before widening.
let mut out = vec![0.0f32; (num_heads * seq_len) as usize];
// The wrapped context takes `i32` dimensions, hence the `as` casts.
self.0
.attn_scores_batched(
num_heads as i32,
num_kv_heads as i32,
head_dim as i32,
seq_len as i32,
q,
k_cache,
scale,
&mut out,
)
.expect("CBackend attn_scores_batched");
out
}
fn attn_softmax_batched(
&mut self,
num_heads: u32,
seq_len: u32,
scores_in: &[f32],
) -> Vec<f32> {
// In-place on a copy of the scores.
let mut out = scores_in.to_vec();
self.0
.attn_softmax_batched(num_heads as i32, seq_len as i32, &mut out)
.expect("CBackend attn_softmax_batched");
out
}
fn attn_values_batched(
&mut self,
num_heads: u32,
num_kv_heads: u32,
head_dim: u32,
seq_len: u32,
scores: &[f32],
v_cache: &[f32],
) -> Vec<f32> {
// Output buffer holds `num_heads * head_dim` floats.
let mut out = vec![0.0f32; (num_heads * head_dim) as usize];
self.0
.attn_values_batched(
num_heads as i32,
num_kv_heads as i32,
head_dim as i32,
seq_len as i32,
scores,
v_cache,
&mut out,
)
.expect("CBackend attn_values_batched");
out
}
fn sigmoid_gate(
&mut self,
dim: u32,
gate: &[f32],
x_in: &[f32],
) -> Vec<f32> {
// In-place on a copy of `x_in`.
let mut out = x_in.to_vec();
self.0
.sigmoid_gate(dim as i32, gate, &mut out)
.expect("CBackend sigmoid_gate");
out
}
fn begin_deferred_experts(
&mut self,
actual_k: i32,
expert_data: &[u8],
h_post: &[f32],
h_mid: &[f32],
shared_out: &[f32],
expert_weights: &[f32],
shared_gate_score: f32,
) {
self.0
.begin_deferred_experts(
actual_k,
expert_data,
h_post,
h_mid,
shared_out,
expert_weights,
shared_gate_score,
)
.expect("CBackend begin_deferred_experts");
}
fn complete_deferred_experts(&mut self) -> Vec<f32> {
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.complete_deferred_experts(&mut out)
.expect("CBackend complete_deferred_experts");
out
}
fn discard_deferred_experts(&mut self) {
self.0
.discard_deferred_experts()
.expect("CBackend discard_deferred_experts");
}
fn layer_forward_dump(
&mut self,
layer_idx: i32,
pos: i32,
hidden_in: &[f32],
) -> Vec<f32> {
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.layer_forward_dump(layer_idx, pos, hidden_in, &mut out)
.expect("CBackend layer_forward_dump");
out
}
fn eval_prompt(&mut self, tokens: &[i32], start_pos: usize) -> Vec<f32> {
let mut logits = vec![0.0f32; self.0.n_vocab()];
// The literal `0` is fixed for all callers — presumably a sequence id;
// confirm against `Ctx::eval_prompt`.
self.0
.eval_prompt(tokens, start_pos, 0, &mut logits)
.expect("CBackend eval_prompt");
logits
}
fn eval_token(&mut self, token: i32, pos: usize) -> Vec<f32> {
let mut logits = vec![0.0f32; self.0.n_vocab()];
// Same fixed `0` third argument as `eval_prompt`.
self.0
.eval_token(token, pos, 0, &mut logits)
.expect("CBackend eval_token");
logits
}
fn memory_clear(&mut self) {
self.0.memory_clear()
}
fn memory_seq_rm(&mut self, p0: i32, p1: i32) -> bool {
// First argument is pinned to 0 (presumably the sequence id — confirm).
self.0.memory_seq_rm(0, p0, p1)
}
fn memory_seq_pos_max(&self) -> i32 {
// Argument is pinned to 0 (presumably the sequence id — confirm).
self.0.memory_seq_pos_max(0)
}
}
/// `DiffBackend` adapter over the Rust implementation (`moeflux::riir::RsCtx`).
pub struct RsBackend(RsCtx);
// Mirrors the `CBackend` impl method for method: allocate or copy the output
// buffer, forward to `RsCtx`, and panic via `expect` with a
// "RsBackend <method>" message on failure. Places where the `RsCtx` call
// shape differs from the C one are commented inline.
impl DiffBackend for RsBackend {
fn open(
weights: &Path,
manifest: &Path,
vocab: &Path,
experts_dir: &Path,
experts_per_tok: u32,
use_2bit: bool,
) -> Self {
Self(
RsCtx::open(
weights,
manifest,
vocab,
experts_dir,
experts_per_tok,
use_2bit,
)
.expect("RsBackend RsCtx::open"),
)
}
fn n_vocab(&self) -> usize {
self.0.n_vocab()
}
fn n_ctx(&self) -> usize {
self.0.n_ctx()
}
fn eos(&self) -> i32 {
self.0.eos()
}
fn model_name(&self) -> &'static str {
self.0.model_name()
}
fn embed(&self, token_id: i32) -> Vec<f32> {
// Output is sized from the compile-time variant.
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0.embed(token_id, &mut out).expect("RsBackend embed");
out
}
fn rms_norm_cpu(&self, weight_name: &str, x: &[f32]) -> Vec<f32> {
// Output length is always `hidden_dim`, independent of `x.len()`.
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.rms_norm_cpu(weight_name, x, &mut out)
.expect("RsBackend rms_norm_cpu");
out
}
fn apply_rotary_emb(
&self,
pos: i32,
q: &[f32],
k: &[f32],
) -> (Vec<f32>, Vec<f32>) {
// The wrapped call rotates in place, so work on copies of the inputs.
let mut q_out = q.to_vec();
let mut k_out = k.to_vec();
self.0
.apply_rotary_emb(pos, &mut q_out, &mut k_out)
.expect("RsBackend apply_rotary_emb");
(q_out, k_out)
}
fn rms_norm_per_head_cpu(
&self,
weight_name: &str,
num_heads: usize,
head_dim: usize,
x: &[f32],
) -> Vec<f32> {
// In-place on a copy of `x`.
let mut out = x.to_vec();
self.0
.rms_norm_per_head_cpu(weight_name, num_heads, head_dim, &mut out)
.expect("RsBackend rms_norm_per_head_cpu");
out
}
fn sdpa_cpu(
&self,
kv_len: i32,
q: &[f32],
q_gate: &[f32],
k_cache: &[f32],
v_cache: &[f32],
) -> Vec<f32> {
// Output has the same length as `q`.
let mut out = vec![0.0f32; q.len()];
self.0
.sdpa_cpu(kv_len, q, q_gate, k_cache, v_cache, &mut out)
.expect("RsBackend sdpa_cpu");
out
}
fn lm_head_cpu(&self, x: &[f32]) -> Vec<f32> {
// One logit per vocabulary entry, as reported by the context.
let mut out = vec![0.0f32; self.0.n_vocab()];
self.0
.lm_head_cpu(x, &mut out)
.expect("RsBackend lm_head_cpu");
out
}
fn moe_router_cpu(&self, scores: Vec<f32>, k: usize) -> (Vec<i32>, Vec<f32>) {
// The wrapped call receives the scores mutably; only indices and
// weights are returned.
let mut s = scores;
let mut idx = vec![0i32; k];
let mut w = vec![0.0f32; k];
self.0
.moe_router_cpu(&mut s, k, &mut idx, &mut w)
.expect("RsBackend moe_router_cpu");
(idx, w)
}
fn conv1d_step_cpu(
&self,
weight_name: &str,
channels: usize,
kernel_size: usize,
conv_state: &[f32],
new_input: &[f32],
) -> Vec<f32> {
// One output value per channel.
let mut out = vec![0.0f32; channels];
self.0
.conv1d_step_cpu(
weight_name,
channels,
kernel_size,
conv_state,
new_input,
&mut out,
)
.expect("RsBackend conv1d_step_cpu");
out
}
fn rms_norm_bare_cpu(&self, eps: f32, x: &[f32]) -> Vec<f32> {
let mut out = vec![0.0f32; x.len()];
self.0
.rms_norm_bare_cpu(eps, x, &mut out)
.expect("RsBackend rms_norm_bare_cpu");
out
}
fn rms_norm_gated_cpu(
&self,
weight_name: &str,
eps: f32,
x: &[f32],
z: &[f32],
) -> Vec<f32> {
let mut out = vec![0.0f32; x.len()];
self.0
.rms_norm_gated_cpu(weight_name, eps, x, z, &mut out)
.expect("RsBackend rms_norm_gated_cpu");
out
}
fn gated_delta_recurrence_cpu(
&self,
layer_idx: usize,
alpha: &[f32],
beta: &[f32],
q: &[f32],
k: &[f32],
v: &[f32],
v_heads: usize,
k_heads: usize,
key_dim: usize,
value_dim: usize,
ssm_state_in: Vec<f32>,
) -> (Vec<f32>, Vec<f32>) {
// The state is passed mutably and returned alongside the output.
let mut state = ssm_state_in;
let mut out = vec![0.0f32; v_heads * value_dim];
self.0
.gated_delta_recurrence_cpu(
layer_idx,
alpha,
beta,
q,
k,
v,
v_heads,
k_heads,
key_dim,
value_dim,
&mut state,
&mut out,
)
.expect("RsBackend gated_delta_recurrence_cpu");
(state, out)
}
fn load_expert_bytes(&self, layer_idx: i32, expert_idx: i32) -> Vec<u8> {
// NOTE(review): the buffer is always the 4-bit expert size, even though
// `open` accepts `use_2bit` — confirm this is intended for 2-bit runs.
let mut out = vec![0u8; moeflux::riir::VARIANT.expert_size_4bit()];
// `RsCtx` takes `usize` indices; `as usize` on a negative `i32` wraps to
// a very large value rather than failing here.
self.0
.load_expert_bytes(
layer_idx as usize,
expert_idx as usize,
&mut out,
)
.expect("RsBackend load_expert_bytes");
out
}
fn gpu_rms_norm_fused(
&mut self,
x: &[f32],
weight_bf16: &[u8],
) -> Vec<f32> {
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.gpu_rms_norm_fused(x, weight_bf16, &mut out)
.expect("RsBackend gpu_rms_norm_fused");
out
}
fn gpu_expert_forward(
&mut self,
expert_data: &[u8],
h_post: &[f32],
) -> Vec<f32> {
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.gpu_expert_forward(expert_data, h_post, &mut out)
.expect("RsBackend gpu_expert_forward");
out
}
fn gpu_batched_experts_forward(
&mut self,
actual_k: i32,
expert_data: &[u8],
h_post: &[f32],
h_mid: &[f32],
shared_out: &[f32],
expert_weights: &[f32],
shared_gate_score: f32,
) -> Vec<f32> {
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.gpu_batched_experts_forward(
actual_k,
expert_data,
h_post,
h_mid,
shared_out,
expert_weights,
shared_gate_score,
&mut out,
)
.expect("RsBackend gpu_batched_experts_forward");
out
}
fn attn_scores_batched(
&mut self,
num_heads: u32,
num_kv_heads: u32,
head_dim: u32,
seq_len: u32,
q: &[f32],
k_cache: &[f32],
scale: f32,
) -> Vec<f32> {
// Output buffer holds `num_heads * seq_len` floats; the product is
// computed in `u32` before widening.
let mut out = vec![0.0f32; (num_heads * seq_len) as usize];
// Unlike the C adapter, the `u32` dimensions are forwarded without casts.
self.0
.attn_scores_batched(
num_heads, num_kv_heads, head_dim, seq_len, q, k_cache,
scale, &mut out,
)
.expect("RsBackend attn_scores_batched");
out
}
fn attn_softmax_batched(
&mut self,
num_heads: u32,
seq_len: u32,
scores_in: &[f32],
) -> Vec<f32> {
// In-place on a copy of the scores.
let mut out = scores_in.to_vec();
self.0
.attn_softmax_batched(num_heads, seq_len, &mut out)
.expect("RsBackend attn_softmax_batched");
out
}
fn attn_values_batched(
&mut self,
num_heads: u32,
num_kv_heads: u32,
head_dim: u32,
seq_len: u32,
scores: &[f32],
v_cache: &[f32],
) -> Vec<f32> {
// Output buffer holds `num_heads * head_dim` floats.
let mut out = vec![0.0f32; (num_heads * head_dim) as usize];
self.0
.attn_values_batched(
num_heads, num_kv_heads, head_dim, seq_len, scores, v_cache,
&mut out,
)
.expect("RsBackend attn_values_batched");
out
}
fn sigmoid_gate(
&mut self,
dim: u32,
gate: &[f32],
x_in: &[f32],
) -> Vec<f32> {
// In-place on a copy of `x_in`.
let mut out = x_in.to_vec();
self.0
.sigmoid_gate(dim, gate, &mut out)
.expect("RsBackend sigmoid_gate");
out
}
fn begin_deferred_experts(
&mut self,
actual_k: i32,
expert_data: &[u8],
h_post: &[f32],
h_mid: &[f32],
shared_out: &[f32],
expert_weights: &[f32],
shared_gate_score: f32,
) {
// `RsCtx` takes one extra trailing argument that the C call does not;
// it is always `-1` here. NOTE(review): meaning not visible in this
// file — confirm against `RsCtx::begin_deferred_experts`.
self.0
.begin_deferred_experts(
actual_k,
expert_data,
h_post,
h_mid,
shared_out,
expert_weights,
shared_gate_score,
-1,
)
.expect("RsBackend begin_deferred_experts");
}
fn complete_deferred_experts(&mut self) -> Vec<f32> {
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.complete_deferred_experts(&mut out)
.expect("RsBackend complete_deferred_experts");
out
}
fn discard_deferred_experts(&mut self) {
// Unlike the C adapter, no `expect` is applied to this call's result.
self.0.discard_deferred_experts();
}
fn layer_forward_dump(
&mut self,
layer_idx: i32,
pos: i32,
hidden_in: &[f32],
) -> Vec<f32> {
let mut out = vec![0.0f32; moeflux::riir::VARIANT.hidden_dim];
self.0
.layer_forward_dump(layer_idx, pos, hidden_in, &mut out)
.expect("RsBackend layer_forward_dump");
out
}
fn eval_prompt(&mut self, tokens: &[i32], start_pos: usize) -> Vec<f32> {
let mut logits = vec![0.0f32; self.0.n_vocab()];
// The literal `0` is fixed for all callers — presumably a sequence id;
// confirm against `RsCtx::eval_prompt`.
self.0
.eval_prompt(tokens, start_pos, 0, &mut logits)
.expect("RsBackend eval_prompt");
logits
}
fn eval_token(&mut self, token: i32, pos: usize) -> Vec<f32> {
let mut logits = vec![0.0f32; self.0.n_vocab()];
// Same fixed `0` third argument as `eval_prompt`.
self.0
.eval_token(token, pos, 0, &mut logits)
.expect("RsBackend eval_token");
logits
}
fn memory_clear(&mut self) {
self.0.memory_clear()
}
fn memory_seq_rm(&mut self, p0: i32, p1: i32) -> bool {
// First argument is pinned to 0 (presumably the sequence id — confirm).
self.0.memory_seq_rm(0, p0, p1)
}
fn memory_seq_pos_max(&self) -> i32 {
// Argument is pinned to 0 (presumably the sequence id — confirm).
self.0.memory_seq_pos_max(0)
}
}
/// Returns the index of the largest logit as an `i32`.
///
/// Only a strictly greater value replaces the running best, so ties resolve
/// to the earliest index. Values that never compare greater than
/// `f32::NEG_INFINITY` (NaN, or `-inf` itself) are never selected, which means
/// an empty, all-NaN or all-`-inf` slice yields `0`.
pub fn argmax(logits: &[f32]) -> i32 {
    let (winner, _) = logits.iter().enumerate().fold(
        (0i32, f32::NEG_INFINITY),
        |(idx, top), (pos, &val)| {
            if val > top {
                (pos as i32, val)
            } else {
                (idx, top)
            }
        },
    );
    winner
}
/// Returns the indices of the `k` largest logits, best first.
///
/// Equal values are ordered by ascending index. NaN logits rank after every
/// non-NaN value (and among themselves by index). Falling back to `Equal`
/// when `partial_cmp` returns `None` does not give the sort a total order, so
/// the ranking was unspecified whenever a NaN was present; this comparator
/// makes it deterministic.
///
/// If `k` exceeds `logits.len()`, all indices are returned.
pub fn topk(logits: &[f32], k: usize) -> Vec<i32> {
    use std::cmp::Ordering;

    let mut ranked: Vec<(i32, f32)> = logits
        .iter()
        .enumerate()
        .map(|(i, &v)| (i as i32, v))
        .collect();
    // The index tiebreak makes every key distinct, so an unstable sort yields
    // the same order as a stable one.
    ranked.sort_unstable_by(|a, b| {
        let by_value = match (a.1.is_nan(), b.1.is_nan()) {
            (true, true) => Ordering::Equal,
            (true, false) => Ordering::Greater,
            (false, true) => Ordering::Less,
            // Both finite or infinite: descending by value.
            (false, false) => b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal),
        };
        by_value.then_with(|| a.0.cmp(&b.0))
    });
    ranked.truncate(k);
    ranked.into_iter().map(|(i, _)| i).collect()
}
/// Jaccard similarity of the two id lists, treated as sets.
///
/// Duplicates within either slice are ignored. Two empty inputs count as
/// identical and score `1.0`.
pub fn jaccard(a: &[i32], b: &[i32]) -> f32 {
    use std::collections::HashSet;

    let left: HashSet<i32> = a.iter().copied().collect();
    let right: HashSet<i32> = b.iter().copied().collect();
    let shared = left.iter().filter(|&id| right.contains(id)).count();
    // |A ∪ B| = |A| + |B| − |A ∩ B|
    let total = left.len() + right.len() - shared;
    if total == 0 {
        return 1.0;
    }
    shared as f32 / total as f32
}
/// Cosine similarity of `a` and `b`, accumulated in `f64` and returned as `f32`.
///
/// Returns `1.0` when the product of the squared norms is zero (either vector
/// is all zeros, or both are empty).
///
/// # Panics
///
/// Panics if the slices differ in length.
pub fn cosine_sim(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len(), "cosine_sim: length mismatch");
    let (dot, norm_a_sq, norm_b_sq) = a.iter().zip(b).fold(
        (0.0f64, 0.0f64, 0.0f64),
        |(d, pa, pb), (&x, &y)| {
            let (x, y) = (f64::from(x), f64::from(y));
            (d + x * y, pa + x * x, pb + y * y)
        },
    );
    let denom = (norm_a_sq * norm_b_sq).sqrt();
    if denom == 0.0 {
        return 1.0;
    }
    (dot / denom) as f32
}
/// How many top-ranked token ids are compared between the two logit vectors.
pub const TOPK_K: usize = 20;
/// Minimum Jaccard overlap required between the two top-`TOPK_K` id sets.
pub const TOPK_JACCARD_MIN: f32 = 0.95;
/// Minimum cosine similarity required between the two full logit vectors.
pub const COSINE_SIM_MIN: f32 = 0.99;
/// Asserts that the C logits `c` and the Rust logits `rs` agree.
///
/// Logs one summary line to stderr, then checks in order: identical argmax,
/// top-`TOPK_K` Jaccard overlap of at least `TOPK_JACCARD_MIN`, and cosine
/// similarity of at least `COSINE_SIM_MIN`. `label` tags the log line and
/// every failure message.
///
/// # Panics
///
/// Panics on the first failed check, or if the slices differ in length
/// (raised by `cosine_sim`).
pub fn assert_logits_close(label: &str, c: &[f32], rs: &[f32]) {
    let (c_arg, rs_arg) = (argmax(c), argmax(rs));
    let jac = jaccard(&topk(c, TOPK_K), &topk(rs, TOPK_K));
    let cos = cosine_sim(c, rs);

    // Log before asserting so the metrics are visible even when a check fails.
    eprintln!(
        "[diff:{label}] argmax c={c_arg} rs={rs_arg} \
         top-{TOPK_K} jaccard={jac:.4} cosine={cos:.5}"
    );

    assert_eq!(
        c_arg, rs_arg,
        "[diff:{label}] argmax mismatch (c={c_arg} rs={rs_arg})"
    );
    assert!(
        jac >= TOPK_JACCARD_MIN,
        "[diff:{label}] top-{TOPK_K} jaccard {jac:.4} below {TOPK_JACCARD_MIN}"
    );
    assert!(
        cos >= COSINE_SIM_MIN,
        "[diff:{label}] cosine sim {cos:.5} below {COSINE_SIM_MIN}"
    );
}
/// Directory holding the model artifacts (`model_weights.bin`, its JSON
/// manifest and `vocab.bin`).
///
/// Taken from the `MOEFLUX_SMOKE_ARTIFACTS` environment variable when it is
/// set, otherwise the built-in default path. The variable is read with
/// `var_os`, so an override that is not valid UTF-8 is honoured instead of
/// silently falling back to the default, and the default is only built when
/// it is actually needed.
fn artifacts_dir() -> PathBuf {
    const DEFAULT: &str =
        "/Volumes/Temp Backup/models/moeflux/qwen3-6-35b-a3b-artifacts";
    std::env::var_os("MOEFLUX_SMOKE_ARTIFACTS")
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from(DEFAULT))
}
/// Model root directory, passed to the backends as their experts directory.
///
/// Taken from the `MOEFLUX_SMOKE_ROOT` environment variable when it is set,
/// otherwise the built-in default path. The variable is read with `var_os`,
/// so an override that is not valid UTF-8 is honoured instead of silently
/// falling back to the default, and the default is only built when it is
/// actually needed.
fn root_dir() -> PathBuf {
    const DEFAULT: &str =
        "/Volumes/Temp Backup/models/moeflux/qwen3-6-35b-a3b-root";
    std::env::var_os("MOEFLUX_SMOKE_ROOT")
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from(DEFAULT))
}
/// Opens a backend of type `B` on the standard smoke-test artifacts.
///
/// Weights, manifest and vocab come from `artifacts_dir()`; the experts
/// directory is `root_dir()`. Every backend is opened with 4 experts per
/// token and 2-bit mode off.
pub fn open_backend<B: DiffBackend>() -> B {
    const EXPERTS_PER_TOK: u32 = 4;
    const USE_2BIT: bool = false;

    let artifacts = artifacts_dir();
    let experts_dir = root_dir();
    let weights = artifacts.join("model_weights.bin");
    let manifest = artifacts.join("model_weights.json");
    let vocab = artifacts.join("vocab.bin");

    B::open(
        &weights,
        &manifest,
        &vocab,
        &experts_dir,
        EXPERTS_PER_TOK,
        USE_2BIT,
    )
}
/// Opens the weight file through the Rust loader and sanity-checks it: the
/// tensor count, the presence of the embedding tensor, and that its byte
/// slice is as long as its manifest entry says.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn weight_file_loads_a3b() {
    const EMBED_TENSOR: &str = "model.embed_tokens.weight";

    let dir = artifacts_dir();
    let bin_path = dir.join("model_weights.bin");
    let manifest_path = dir.join("model_weights.json");
    let wf = moeflux::riir::WeightFile::open(&bin_path, &manifest_path)
        .expect("WeightFile::open");

    eprintln!(
        "[diff:weight_file] {} tensors in {:.2} GB",
        wf.len(),
        wf.file_size() as f64 / 1e9,
    );
    // Expected tensor count for this model; the message ties it to the C loader.
    assert_eq!(wf.len(), 1397, "tensor count drifted from C");

    let embed = wf
        .tensor_info(EMBED_TENSOR)
        .expect("model.embed_tokens.weight");
    assert!(!embed.dtype.is_empty(), "embed_tokens dtype empty");
    eprintln!(
        "[diff:weight_file] embed_tokens dtype={} shape={:?} bits={} size={}",
        embed.dtype, embed.shape, embed.bits, embed.size,
    );

    let raw = wf.tensor_bytes(EMBED_TENSOR).expect("embed bytes");
    assert_eq!(raw.len() as u64, embed.size);
}
/// Opens the C backend, lets `common::c_backend::assert_matches_c` validate
/// it, and logs the model metadata it reports.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn variants_match_c() {
    let backend = open_backend::<CBackend>();
    common::c_backend::assert_matches_c(&backend.0);

    let name = backend.model_name();
    let n_vocab = backend.n_vocab();
    let n_ctx = backend.n_ctx();
    let eos = backend.eos();
    eprintln!(
        "[diff:variants] {} n_vocab={} n_ctx={} eos={}",
        name, n_vocab, n_ctx, eos,
    );
}
/// Largest ULP distance tolerated by the "close" (non-bit-exact) tests.
const MAX_ULP_DRIFT: u32 = 128;
/// Distance between `a` and `b` in units in the last place.
///
/// * Identical bit patterns (including identical NaN payloads) give `0`.
/// * Any other comparison involving NaN gives `u32::MAX`.
/// * `+0.0` and `-0.0` give `0`. They are numerically equal, but the sign
///   check below used to report them as maximally different.
/// * Other values of opposite sign give `u32::MAX`.
/// * Otherwise the result is the absolute difference of the raw bit patterns,
///   which for same-signed floats counts the representable values between
///   them.
fn ulp_diff(a: f32, b: f32) -> u32 {
    let (ai, bi) = (a.to_bits(), b.to_bits());
    if ai == bi {
        return 0;
    }
    if a.is_nan() || b.is_nan() {
        return u32::MAX;
    }
    // With differing bits and no NaN, `a == b` holds only for +0.0 vs -0.0.
    if a == b {
        return 0;
    }
    if a.is_sign_negative() != b.is_sign_negative() {
        return u32::MAX;
    }
    if ai > bi {
        ai - bi
    } else {
        bi - ai
    }
}
/// Applies rotary embedding through both backends at several positions and
/// requires the Q and K outputs to agree within `MAX_ULP_DRIFT` ULPs.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn rope_close_c_vs_rust() {
    use moeflux::riir::VARIANT;

    /// Worst ULP distance and number of bitwise-different lanes.
    fn drift(lhs: &[f32], rhs: &[f32]) -> (u32, usize) {
        let mut worst = 0u32;
        let mut changed = 0usize;
        for (&a, &b) in lhs.iter().zip(rhs.iter()) {
            worst = worst.max(ulp_diff(a, b));
            if a.to_bits() != b.to_bits() {
                changed += 1;
            }
        }
        (worst, changed)
    }

    let c = open_backend::<CBackend>();
    let rs = open_backend::<RsBackend>();

    // Deterministic synthetic inputs sized to the model's Q and K widths.
    let head_dim = VARIANT.head_dim;
    let q_in: Vec<f32> = (0..VARIANT.num_attn_heads * head_dim)
        .map(|i| ((i as f32) * 0.013).sin() * 0.7 + 0.1)
        .collect();
    let k_in: Vec<f32> = (0..VARIANT.num_kv_heads * head_dim)
        .map(|i| ((i as f32) * 0.019).cos() * 0.5 - 0.2)
        .collect();

    for pos in [0i32, 1, 17, 1024, 65535] {
        let (c_q, c_k) = c.apply_rotary_emb(pos, &q_in, &k_in);
        let (rs_q, rs_k) = rs.apply_rotary_emb(pos, &q_in, &k_in);
        let (q_max_ulp, q_diff_count) = drift(&c_q, &rs_q);
        let (k_max_ulp, k_diff_count) = drift(&c_k, &rs_k);

        eprintln!(
            "[diff:rope pos={pos}] Q max_ulp={q_max_ulp} ({}/{} differ); \
             K max_ulp={k_max_ulp} ({}/{} differ)",
            q_diff_count,
            c_q.len(),
            k_diff_count,
            c_k.len(),
        );
        assert!(
            q_max_ulp <= MAX_ULP_DRIFT,
            "[diff:rope pos={pos}] Q max ULP drift {q_max_ulp} > {MAX_ULP_DRIFT}"
        );
        assert!(
            k_max_ulp <= MAX_ULP_DRIFT,
            "[diff:rope pos={pos}] K max ULP drift {k_max_ulp} > {MAX_ULP_DRIFT}"
        );
    }
}
/// Runs `rms_norm_cpu` through both backends on real embeddings, for several
/// tokens and norm weights, and requires bit-identical outputs.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn rms_norm_cpu_bit_exact_c_vs_rust() {
    use moeflux::riir::VARIANT;

    let c = open_backend::<CBackend>();
    let rs = open_backend::<RsBackend>();

    let last_token = VARIANT.vocab_size as i32 - 1;
    let token_ids = [0i32, 1, VARIANT.eos_token_1, last_token];
    let weight_names = [
        "model.norm.weight",
        "model.layers.0.input_layernorm.weight",
        "model.layers.0.post_attention_layernorm.weight",
    ];

    for &tok in &token_ids {
        // Both backends receive the same input: the C backend's embedding.
        let x = c.embed(tok);
        for &w_name in &weight_names {
            let c_out = c.rms_norm_cpu(w_name, &x);
            let rs_out = rs.rms_norm_cpu(w_name, &x);

            // Count mismatching lanes and remember the first one.
            let mut n_bad = 0usize;
            let mut first_bad: Option<(usize, f32, f32)> = None;
            for (i, (&a, &b)) in c_out.iter().zip(rs_out.iter()).enumerate() {
                if a.to_bits() != b.to_bits() {
                    n_bad += 1;
                    first_bad.get_or_insert((i, a, b));
                }
            }
            if let Some((at, c_val, rs_val)) = first_bad {
                panic!(
                    "[diff:rms_norm_cpu token={tok} w={w_name}] {} of {} elements differ; \
                     first at index {} (c={} rs={})",
                    n_bad,
                    c_out.len(),
                    at,
                    c_val,
                    rs_val,
                );
            }
            eprintln!(
                "[diff:rms_norm_cpu token={tok} w={w_name}] {} elements bit-equal; \
                 first 4: {:?}",
                c_out.len(),
                &c_out[..4],
            );
        }
    }
}
/// Runs `sdpa_cpu` through both backends on synthetic Q and KV data at several
/// cache lengths. Requires cosine similarity of at least 0.9999 and a maximum
/// absolute difference within 1e-3 of the largest C output magnitude.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn sdpa_cpu_close_c_vs_rust() {
    use moeflux::riir::VARIANT;

    let c = open_backend::<CBackend>();
    let rs = open_backend::<RsBackend>();

    let head_dim = VARIANT.head_dim;
    let q_dim = VARIANT.num_attn_heads * head_dim;
    let kv_dim = VARIANT.num_kv_heads * head_dim;

    let q: Vec<f32> = (0..q_dim)
        .map(|i| ((i as f32) * 0.011).sin() * 0.6 + 0.05)
        .collect();
    let q_gate: Vec<f32> = (0..q_dim)
        .map(|i| ((i as f32) * 0.007).cos() * 1.2 - 0.1)
        .collect();

    for kv_len in [1i32, 8, 64, 512] {
        let kv_total = kv_len as usize * kv_dim;
        let k_cache: Vec<f32> = (0..kv_total)
            .map(|i| ((i as f32) * 0.013).sin() * 0.5 + 0.02)
            .collect();
        let v_cache: Vec<f32> = (0..kv_total)
            .map(|i| ((i as f32) * 0.017).cos() * 0.4 - 0.05)
            .collect();

        let c_out = c.sdpa_cpu(kv_len, &q, &q_gate, &k_cache, &v_cache);
        let rs_out = rs.sdpa_cpu(kv_len, &q, &q_gate, &k_cache, &v_cache);

        // One pass over the paired outputs for all element-wise metrics.
        let mut max_ulp = 0u32;
        let mut diff_count = 0usize;
        let mut max_abs_diff = 0.0f32;
        for (&a, &b) in c_out.iter().zip(rs_out.iter()) {
            max_ulp = max_ulp.max(ulp_diff(a, b));
            if a.to_bits() != b.to_bits() {
                diff_count += 1;
            }
            max_abs_diff = max_abs_diff.max((a - b).abs());
        }
        // Reference magnitude comes from the C output alone.
        let mut max_abs_out = 0.0f32;
        for &a in &c_out {
            max_abs_out = max_abs_out.max(a.abs());
        }
        let cos = cosine_sim(&c_out, &rs_out);

        eprintln!(
            "[diff:sdpa kv_len={kv_len}] cosine={cos:.6} max_abs_diff={max_abs_diff:.3e} \
             max_abs_out={max_abs_out:.3e} max_ulp={max_ulp} ({}/{} differ)",
            diff_count,
            c_out.len(),
        );
        assert!(
            cos >= 0.9999,
            "[diff:sdpa kv_len={kv_len}] cosine sim {cos:.6} below 0.9999"
        );
        assert!(
            max_abs_diff <= 1e-3 * max_abs_out.max(1e-6),
            "[diff:sdpa kv_len={kv_len}] max abs diff {max_abs_diff:.3e} \
             > 1e-3 * max abs out ({max_abs_out:.3e})"
        );
    }
}
/// Runs `rms_norm_per_head_cpu` through both backends with the Q and K norm
/// weights of the first full-attention layer and requires bit-identical
/// outputs.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn rms_norm_per_head_cpu_bit_exact_c_vs_rust() {
    use moeflux::riir::VARIANT;

    let c = open_backend::<CBackend>();
    let rs = open_backend::<RsBackend>();

    let head_dim = VARIANT.head_dim;
    let num_attn_heads = VARIANT.num_attn_heads;
    let num_kv_heads = VARIANT.num_kv_heads;

    // First layer index i with (i + 1) divisible by the full-attention interval.
    let fa_layer: usize = (0..VARIANT.num_layers)
        .find(|i| (i + 1) % VARIANT.full_attn_interval == 0)
        .expect("at least one full-attention layer");

    let q_norm_name =
        format!("model.layers.{fa_layer}.self_attn.q_norm.weight");
    let k_norm_name =
        format!("model.layers.{fa_layer}.self_attn.k_norm.weight");

    let q_in: Vec<f32> = (0..num_attn_heads * head_dim)
        .map(|i| ((i as f32) * 0.011).sin() * 0.6 + 0.05)
        .collect();
    let k_in: Vec<f32> = (0..num_kv_heads * head_dim)
        .map(|i| ((i as f32) * 0.023).cos() * 0.4 - 0.1)
        .collect();

    // Shared check, run once for Q and once for K.
    let check = |label: &str, w_name: &str, num_heads: usize, x_in: &[f32]| {
        let c_out = c.rms_norm_per_head_cpu(w_name, num_heads, head_dim, x_in);
        let rs_out = rs.rms_norm_per_head_cpu(w_name, num_heads, head_dim, x_in);

        let mut n_bad = 0usize;
        let mut first_bad: Option<(usize, f32, f32)> = None;
        for (i, (&a, &b)) in c_out.iter().zip(rs_out.iter()).enumerate() {
            if a.to_bits() != b.to_bits() {
                n_bad += 1;
                first_bad.get_or_insert((i, a, b));
            }
        }
        if let Some((at, c_val, rs_val)) = first_bad {
            panic!(
                "[diff:rms_norm_per_head_cpu {label} layer={fa_layer}] {} of {} elements differ; \
                 first at index {} (c={} rs={})",
                n_bad,
                c_out.len(),
                at,
                c_val,
                rs_val,
            );
        }
        eprintln!(
            "[diff:rms_norm_per_head_cpu {label} layer={fa_layer} \
             num_heads={num_heads} head_dim={head_dim}] {} elements bit-equal; \
             first 4: {:?}",
            c_out.len(),
            &c_out[..4],
        );
    };

    check("Q", q_norm_name.as_str(), num_attn_heads, &q_in);
    check("K", k_norm_name.as_str(), num_kv_heads, &k_in);
}
/// Looks up embeddings for a spread of token ids through both backends and
/// requires `hidden_dim`-length, bit-identical results.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn embed_bit_exact_c_vs_rust() {
    use moeflux::riir::VARIANT;

    let c = open_backend::<CBackend>();
    let rs = open_backend::<RsBackend>();

    // First, special, middle and last vocabulary entries.
    let vocab = VARIANT.vocab_size as i32;
    let token_ids = [
        0i32,
        1,
        VARIANT.eos_token_1,
        VARIANT.eos_token_2,
        VARIANT.think_start_token,
        VARIANT.think_end_token,
        vocab / 2,
        vocab - 1,
    ];

    for tok in token_ids {
        let c_emb = c.embed(tok);
        let rs_emb = rs.embed(tok);
        assert_eq!(
            c_emb.len(),
            VARIANT.hidden_dim,
            "[diff:embed token={tok}] C output length mismatch"
        );
        assert_eq!(
            rs_emb.len(),
            VARIANT.hidden_dim,
            "[diff:embed token={tok}] Rust output length mismatch"
        );

        let mut n_bad = 0usize;
        let mut first_bad: Option<(usize, f32, f32)> = None;
        for (i, (&a, &b)) in c_emb.iter().zip(rs_emb.iter()).enumerate() {
            if a.to_bits() != b.to_bits() {
                n_bad += 1;
                first_bad.get_or_insert((i, a, b));
            }
        }
        if let Some((at, c_val, rs_val)) = first_bad {
            panic!(
                "[diff:embed token={tok}] {} of {} elements differ; \
                 first at index {} (c={} rs={})",
                n_bad,
                c_emb.len(),
                at,
                c_val,
                rs_val,
            );
        }
        eprintln!(
            "[diff:embed token={tok}] {} elements bit-equal; first 4: {:?}",
            c_emb.len(),
            &c_emb[..4],
        );
    }
}
/// Runs `lm_head_cpu` through both backends on a synthetic hidden vector and
/// on a real one (normed EOS embedding), and requires bit-identical logits.
/// On a mismatch the panic message carries ULP, absolute and cosine metrics.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn lm_head_cpu_bit_exact_c_vs_rust() {
    use moeflux::riir::VARIANT;

    let c = open_backend::<CBackend>();
    let rs = open_backend::<RsBackend>();

    let x_synth: Vec<f32> = (0..VARIANT.hidden_dim)
        .map(|i| ((i as f32) * 0.011).sin() * 0.6 + 0.05)
        .collect();
    // "Real" input: the C backend's EOS embedding through the final norm.
    let emb = c.embed(VARIANT.eos_token_1);
    let x_real = c.rms_norm_cpu("model.norm.weight", &emb);

    for (label, x) in [("synth", &x_synth), ("real", &x_real)] {
        let c_out = c.lm_head_cpu(x);
        let rs_out = rs.lm_head_cpu(x);
        assert_eq!(c_out.len(), VARIANT.vocab_size, "C output len");
        assert_eq!(rs_out.len(), VARIANT.vocab_size, "Rust output len");

        let mut n_bad = 0usize;
        let mut first_bad: Option<(usize, f32, f32)> = None;
        for (i, (&a, &b)) in c_out.iter().zip(rs_out.iter()).enumerate() {
            if a.to_bits() != b.to_bits() {
                n_bad += 1;
                first_bad.get_or_insert((i, a, b));
            }
        }

        if let Some((at, c_val, rs_val)) = first_bad {
            // Diagnostics are only computed on the failure path.
            let mut max_ulp = 0u32;
            let mut max_abs_diff = 0.0f32;
            for (&a, &b) in c_out.iter().zip(rs_out.iter()) {
                max_ulp = max_ulp.max(ulp_diff(a, b));
                max_abs_diff = max_abs_diff.max((a - b).abs());
            }
            let mut max_abs_out = 0.0f32;
            for &a in &c_out {
                max_abs_out = max_abs_out.max(a.abs());
            }
            let cos = cosine_sim(&c_out, &rs_out);
            panic!(
                "[diff:lm_head {label}] {} of {} elements differ; \
                 max_ulp={max_ulp} max_abs_diff={max_abs_diff:.3e} \
                 max_abs_out={max_abs_out:.3e} cosine={cos:.6}; \
                 first at index {} (c={} rs={})",
                n_bad,
                c_out.len(),
                at,
                c_val,
                rs_val,
            );
        }
        eprintln!(
            "[diff:lm_head {label}] {} elements bit-equal; \
             argmax={} first 4: {:?}",
            c_out.len(),
            argmax(&c_out),
            &c_out[..4],
        );
    }
}
/// Runs `moe_router_cpu` through both backends on two score vectors: "clear"
/// (k strongly boosted experts over a near-flat background) and "spread" (a
/// smooth mix). Requires the same set of selected experts, per-expert weights
/// within `MAX_ULP_DRIFT` ULPs, and weights summing to 1 on both sides.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn moe_router_cpu_close_c_vs_rust() {
    use moeflux::riir::VARIANT;
    use std::collections::HashMap;

    let c = open_backend::<CBackend>();
    let rs = open_backend::<RsBackend>();

    let n_experts = VARIANT.num_experts;
    let k = VARIANT.num_experts_per_tok;
    assert!(n_experts >= k && k >= 1);

    // "clear": tiny background scores with k evenly spaced, distinct bumps.
    let mut clear: Vec<f32> = (0..n_experts)
        .map(|i| ((i as f32) * 0.011).sin() * 0.01 + 0.0)
        .collect();
    let stride = n_experts / k.max(1);
    for slot in 0..k {
        clear[(slot * stride) % n_experts] = 5.0 + slot as f32 * 0.5;
    }
    let spread: Vec<f32> = (0..n_experts)
        .map(|i| ((i as f32) * 0.013).cos() * 0.7 + ((i as f32) * 0.041).sin() * 0.3)
        .collect();

    for (label, scores) in [("clear", &clear), ("spread", &spread)] {
        let (c_idx, c_w) = c.moe_router_cpu(scores.to_vec(), k);
        let (rs_idx, rs_w) = rs.moe_router_cpu(scores.to_vec(), k);

        // Selection order may differ between backends; compare as sorted sets.
        let mut c_sorted = c_idx.clone();
        c_sorted.sort_unstable();
        let mut rs_sorted = rs_idx.clone();
        rs_sorted.sort_unstable();
        assert_eq!(
            c_sorted, rs_sorted,
            "[diff:moe_router {label}] index set mismatch (c={c_idx:?} rs={rs_idx:?})"
        );

        // Pair each weight with its expert index so weights are compared per expert.
        let mut c_by_idx: HashMap<i32, f32> =
            c_idx.iter().copied().zip(c_w.iter().copied()).collect();
        let mut rs_by_idx: HashMap<i32, f32> =
            rs_idx.iter().copied().zip(rs_w.iter().copied()).collect();
        let mut max_ulp = 0u32;
        let mut max_abs_diff = 0.0f32;
        for expert in &c_sorted {
            let cw = c_by_idx.remove(expert).unwrap();
            let rw = rs_by_idx.remove(expert).unwrap();
            max_ulp = max_ulp.max(ulp_diff(cw, rw));
            max_abs_diff = max_abs_diff.max((cw - rw).abs());
        }

        let weight_sum_c: f32 = c_w.iter().sum();
        let weight_sum_rs: f32 = rs_w.iter().sum();
        eprintln!(
            "[diff:moe_router {label} n={n_experts} k={k}] \
             max_ulp={max_ulp} max_abs_diff={max_abs_diff:.3e} \
             c_sum={weight_sum_c:.6} rs_sum={weight_sum_rs:.6}"
        );
        assert!(
            max_ulp <= MAX_ULP_DRIFT,
            "[diff:moe_router {label}] max ULP drift {max_ulp} > {MAX_ULP_DRIFT}"
        );
        assert!(
            (weight_sum_c - 1.0).abs() < 1e-5,
            "[diff:moe_router {label}] C weights don't sum to 1 ({weight_sum_c})"
        );
        assert!(
            (weight_sum_rs - 1.0).abs() < 1e-5,
            "[diff:moe_router {label}] Rust weights don't sum to 1 ({weight_sum_rs})"
        );
    }
}
/// Differential test: `rms_norm_bare_cpu` (called with only `eps` and the
/// input, no weight tensor name) must produce bit-identical output from the
/// C and Rust backends.
///
/// The input is a deterministic sinusoid of length `Variant::LINEAR_KEY_DIM`.
/// On any bit mismatch the test panics with the number of differing elements
/// and the first offending index/value pair.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn rms_norm_bare_cpu_bit_exact_c_vs_rust() {
    use moeflux::riir::variants::Variant;

    let c: CBackend = open_backend();
    let rs: RsBackend = open_backend();
    let dim = Variant::LINEAR_KEY_DIM;
    let x: Vec<f32> = (0..dim)
        .map(|i| ((i as f32) * 0.011).sin() * 0.6 + 0.05)
        .collect();
    let eps = 1e-6f32;
    let c_out = c.rms_norm_bare_cpu(eps, &x);
    let rs_out = rs.rms_norm_bare_cpu(eps, &x);

    // `zip` stops at the shorter side, so without these checks a truncated
    // (or empty) output from one backend would be reported as "bit-equal".
    assert!(!c_out.is_empty(), "[diff:rms_norm_bare] C output is empty");
    assert_eq!(
        c_out.len(),
        rs_out.len(),
        "[diff:rms_norm_bare] output length mismatch (c={} rs={})",
        c_out.len(),
        rs_out.len()
    );

    // Walk the element pairs lazily: the first mismatch feeds the message,
    // the remainder is only counted.
    let mut mismatches = c_out
        .iter()
        .zip(rs_out.iter())
        .enumerate()
        .filter(|(_, (a, b))| a.to_bits() != b.to_bits());
    if let Some((idx, (&c_val, &rs_val))) = mismatches.next() {
        let differing = 1 + mismatches.count();
        panic!(
            "[diff:rms_norm_bare] {} of {} elements differ; \
             first at index {} (c={} rs={})",
            differing,
            c_out.len(),
            idx,
            c_val,
            rs_val,
        );
    }

    eprintln!(
        "[diff:rms_norm_bare dim={dim}] {} elements bit-equal; first 4: {:?}",
        c_out.len(),
        // Preview at most four values without panicking on a short output.
        &c_out[..c_out.len().min(4)],
    );
}
/// Differential test: one `conv1d_step_cpu` call must agree between the C and
/// Rust backends to within `MAX_ULP_DRIFT` ULPs per element.
///
/// Uses the layer-0 linear-attention conv1d weight, a synthetic conv state of
/// `(kernel_size - 1) * channels` floats and a synthetic new input of
/// `channels` floats. Summary statistics are logged before the assertion.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn conv1d_step_cpu_close_c_vs_rust() {
    use moeflux::riir::variants::Variant;
    use moeflux::riir::VARIANT;

    let c: CBackend = open_backend();
    let rs: RsBackend = open_backend();
    let channels = VARIANT.linear_conv_dim();
    let kernel_size = Variant::CONV_KERNEL_SIZE;
    let weight_name = "model.layers.0.linear_attn.conv1d.weight";
    let conv_state: Vec<f32> = (0..(kernel_size - 1) * channels)
        .map(|i| ((i as f32) * 0.013).sin() * 0.4 + 0.02)
        .collect();
    let new_input: Vec<f32> = (0..channels)
        .map(|i| ((i as f32) * 0.019).cos() * 0.5 - 0.1)
        .collect();

    let c_out =
        c.conv1d_step_cpu(weight_name, channels, kernel_size, &conv_state, &new_input);
    let rs_out =
        rs.conv1d_step_cpu(weight_name, channels, kernel_size, &conv_state, &new_input);

    // The comparison below zips the two outputs; `zip` stops at the shorter
    // side, so an empty or truncated output would otherwise yield
    // `max_ulp == 0` and a vacuous pass.
    assert!(!c_out.is_empty(), "[diff:conv1d_step] C output is empty");
    assert_eq!(
        c_out.len(),
        rs_out.len(),
        "[diff:conv1d_step] output length mismatch (c={} rs={})",
        c_out.len(),
        rs_out.len()
    );

    // Gather all pairwise statistics in a single pass.
    let mut max_ulp = 0u32;
    let mut diff_count = 0usize;
    let mut max_abs_diff = 0.0f32;
    for (&a, &b) in c_out.iter().zip(rs_out.iter()) {
        max_ulp = max_ulp.max(ulp_diff(a, b));
        if a.to_bits() != b.to_bits() {
            diff_count += 1;
        }
        max_abs_diff = max_abs_diff.max((a - b).abs());
    }
    // Magnitude reference is taken from the C output only.
    let max_abs_out = c_out.iter().fold(0.0f32, |peak, &a| peak.max(a.abs()));

    eprintln!(
        "[diff:conv1d_step channels={channels} kernel={kernel_size}] \
         max_ulp={max_ulp} max_abs_diff={max_abs_diff:.3e} \
         max_abs_out={max_abs_out:.3e} ({diff_count}/{} differ)",
        c_out.len(),
    );
    assert!(
        max_ulp <= MAX_ULP_DRIFT,
        "[diff:conv1d_step] max ULP drift {max_ulp} > {MAX_ULP_DRIFT}"
    );
}
/// Differential test: `rms_norm_gated_cpu` must agree between the C and Rust
/// backends to within `MAX_ULP_DRIFT` ULPs per element.
///
/// Uses the layer-0 linear-attention norm weight with synthetic `x` and `z`
/// inputs of length `Variant::LINEAR_VALUE_DIM`. Summary statistics are
/// logged before the assertion.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn rms_norm_gated_cpu_close_c_vs_rust() {
    use moeflux::riir::variants::Variant;

    let c: CBackend = open_backend();
    let rs: RsBackend = open_backend();
    let dim = Variant::LINEAR_VALUE_DIM;
    let weight_name = "model.layers.0.linear_attn.norm.weight";
    let eps = 1e-6f32;
    let x: Vec<f32> = (0..dim)
        .map(|i| ((i as f32) * 0.011).sin() * 0.6 + 0.05)
        .collect();
    let z: Vec<f32> = (0..dim)
        .map(|i| ((i as f32) * 0.017).cos() * 1.2 - 0.1)
        .collect();

    let c_out = c.rms_norm_gated_cpu(weight_name, eps, &x, &z);
    let rs_out = rs.rms_norm_gated_cpu(weight_name, eps, &x, &z);

    // `zip` stops at the shorter side; without these checks an empty or
    // truncated output would give `max_ulp == 0` and pass vacuously.
    assert!(!c_out.is_empty(), "[diff:rms_norm_gated] C output is empty");
    assert_eq!(
        c_out.len(),
        rs_out.len(),
        "[diff:rms_norm_gated] output length mismatch (c={} rs={})",
        c_out.len(),
        rs_out.len()
    );

    // Gather all pairwise statistics in a single pass.
    let mut max_ulp = 0u32;
    let mut diff_count = 0usize;
    let mut max_abs_diff = 0.0f32;
    for (&a, &b) in c_out.iter().zip(rs_out.iter()) {
        max_ulp = max_ulp.max(ulp_diff(a, b));
        if a.to_bits() != b.to_bits() {
            diff_count += 1;
        }
        max_abs_diff = max_abs_diff.max((a - b).abs());
    }
    // Magnitude reference is taken from the C output only.
    let max_abs_out = c_out.iter().fold(0.0f32, |peak, &a| peak.max(a.abs()));

    eprintln!(
        "[diff:rms_norm_gated dim={dim}] max_ulp={max_ulp} \
         max_abs_diff={max_abs_diff:.3e} max_abs_out={max_abs_out:.3e} \
         ({diff_count}/{} differ)",
        c_out.len(),
    );
    assert!(
        max_ulp <= MAX_ULP_DRIFT,
        "[diff:rms_norm_gated] max ULP drift {max_ulp} > {MAX_ULP_DRIFT}"
    );
}
/// Differential test: one `gated_delta_recurrence_cpu` step on layer 0 must
/// agree between the C and Rust backends, for both returned vectors (bound
/// here as `state` and `out`), to within `MAX_ULP_DRIFT` ULPs per element.
///
/// Inputs are deterministic sinusoids sized from the variant's head counts
/// and `Variant::LINEAR_KEY_DIM` / `Variant::LINEAR_VALUE_DIM`; the incoming
/// SSM state is all zeros.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn gated_delta_recurrence_cpu_close_c_vs_rust() {
    use moeflux::riir::variants::Variant;
    use moeflux::riir::VARIANT;

    /// Largest ULP distance and number of bit-different pairs over the
    /// element-wise pairing of `a` and `b`.
    fn ulp_stats(a: &[f32], b: &[f32]) -> (u32, usize) {
        let mut worst = 0u32;
        let mut differing = 0usize;
        for (&x, &y) in a.iter().zip(b.iter()) {
            worst = worst.max(ulp_diff(x, y));
            if x.to_bits() != y.to_bits() {
                differing += 1;
            }
        }
        (worst, differing)
    }

    let c: CBackend = open_backend();
    let rs: RsBackend = open_backend();
    let v_heads = VARIANT.linear_num_v_heads;
    let k_heads = VARIANT.linear_num_k_heads;
    let key_dim = Variant::LINEAR_KEY_DIM;
    let value_dim = Variant::LINEAR_VALUE_DIM;
    let layer_idx = 0usize;

    let alpha: Vec<f32> = (0..v_heads)
        .map(|i| ((i as f32) * 0.013).sin() * 0.3)
        .collect();
    let beta: Vec<f32> = (0..v_heads)
        .map(|i| ((i as f32) * 0.017).cos() * 0.5)
        .collect();
    let q: Vec<f32> = (0..k_heads * key_dim)
        .map(|i| ((i as f32) * 0.011).sin() * 0.4 + 0.05)
        .collect();
    let k_in: Vec<f32> = (0..k_heads * key_dim)
        .map(|i| ((i as f32) * 0.019).cos() * 0.5 - 0.1)
        .collect();
    let v_in: Vec<f32> = (0..v_heads * value_dim)
        .map(|i| ((i as f32) * 0.023).sin() * 0.6 + 0.02)
        .collect();
    let ssm_state_in: Vec<f32> = vec![0.0f32; v_heads * value_dim * key_dim];

    let (c_state, c_out) = c.gated_delta_recurrence_cpu(
        layer_idx, &alpha, &beta, &q, &k_in, &v_in,
        v_heads, k_heads, key_dim, value_dim, ssm_state_in.clone(),
    );
    let (rs_state, rs_out) = rs.gated_delta_recurrence_cpu(
        layer_idx, &alpha, &beta, &q, &k_in, &v_in,
        v_heads, k_heads, key_dim, value_dim, ssm_state_in,
    );

    // The statistics zip C against Rust; `zip` stops at the shorter side, so
    // empty or truncated vectors would otherwise pass with zero drift.
    assert!(
        !c_state.is_empty() && !c_out.is_empty(),
        "[diff:gated_delta_recurrence] C returned an empty vector \
         (state len={} out len={})",
        c_state.len(),
        c_out.len()
    );
    assert_eq!(
        c_state.len(),
        rs_state.len(),
        "[diff:gated_delta_recurrence] state length mismatch (c={} rs={})",
        c_state.len(),
        rs_state.len()
    );
    assert_eq!(
        c_out.len(),
        rs_out.len(),
        "[diff:gated_delta_recurrence] out length mismatch (c={} rs={})",
        c_out.len(),
        rs_out.len()
    );

    let (state_max_ulp, state_diff_count) = ulp_stats(&c_state, &rs_state);
    let (out_max_ulp, out_diff_count) = ulp_stats(&c_out, &rs_out);
    // Magnitude reference is taken from the C output only.
    let out_max_abs = c_out.iter().fold(0.0f32, |peak, &a| peak.max(a.abs()));
    let out_max_diff = c_out
        .iter()
        .zip(rs_out.iter())
        .fold(0.0f32, |worst, (&a, &b)| worst.max((a - b).abs()));

    eprintln!(
        "[diff:gated_delta_recurrence layer={layer_idx} v_heads={v_heads} \
         k_heads={k_heads} key_dim={key_dim} value_dim={value_dim}] \
         state max_ulp={state_max_ulp} ({}/{} differ); \
         out max_ulp={out_max_ulp} max_abs_diff={:.3e} max_abs_out={:.3e} \
         ({}/{} differ)",
        state_diff_count,
        c_state.len(),
        out_max_diff,
        out_max_abs,
        out_diff_count,
        c_out.len(),
    );
    assert!(
        state_max_ulp <= MAX_ULP_DRIFT,
        "[diff:gated_delta_recurrence] state max ULP drift \
         {state_max_ulp} > {MAX_ULP_DRIFT}"
    );
    assert!(
        out_max_ulp <= MAX_ULP_DRIFT,
        "[diff:gated_delta_recurrence] out max ULP drift \
         {out_max_ulp} > {MAX_ULP_DRIFT}"
    );
}
/// Differential test: `gpu_expert_forward` on seeded synthetic expert data
/// must produce closely matching output from the C and Rust backends.
///
/// Both outputs must be finite and not all zero; agreement is judged by
/// cosine similarity (at least `COSINE_FLOOR`) and by the largest absolute
/// difference relative to the largest output magnitude (at most
/// `REL_DIFF_FLOOR`).
#[test]
#[ignore = "long running; needs Metal device + moeflux artifacts"]
fn gpu_expert_forward_close_c_vs_rust() {
    use moeflux::riir::expert_forward::synth;
    use moeflux::riir::VARIANT;

    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;

    /// Largest absolute value in `xs` (0 for an empty slice).
    fn peak_abs(xs: &[f32]) -> f32 {
        xs.iter().fold(0.0f32, |peak, x| peak.max(x.abs()))
    }
    /// True when no element is NaN or infinite.
    fn all_finite(xs: &[f32]) -> bool {
        !xs.iter().any(|x| !x.is_finite())
    }
    /// True when at least one element differs from zero.
    fn has_nonzero(xs: &[f32]) -> bool {
        xs.iter().any(|&x| x != 0.0)
    }

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();

    let expert_data = synth::expert_data_seeded();
    let h_post = synth::h_post_seeded();
    assert_eq!(expert_data.len(), VARIANT.expert_size_4bit());
    assert_eq!(h_post.len(), VARIANT.hidden_dim);

    let c_out = c.gpu_expert_forward(&expert_data, &h_post);
    let rs_out = rs.gpu_expert_forward(&expert_data, &h_post);
    assert_eq!(c_out.len(), VARIANT.hidden_dim);
    assert_eq!(rs_out.len(), VARIANT.hidden_dim);

    let cos = cosine_sim(&c_out, &rs_out);
    // The magnitude reference spans both outputs.
    let max_abs_out = peak_abs(&c_out).max(peak_abs(&rs_out));
    let max_abs_diff = c_out
        .iter()
        .zip(&rs_out)
        .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
    let rel = if max_abs_out > 0.0 {
        max_abs_diff / max_abs_out
    } else {
        0.0
    };
    eprintln!(
        "[diff:gpu_expert_forward] cosine={cos:.7} \
         max_abs_diff={max_abs_diff:.3e} max_abs_out={max_abs_out:.3e} \
         relative={rel:.3e}"
    );

    assert!(
        all_finite(&c_out),
        "[diff:gpu_expert_forward] C output has NaN/Inf"
    );
    assert!(
        all_finite(&rs_out),
        "[diff:gpu_expert_forward] Rust output has NaN/Inf"
    );
    assert!(
        has_nonzero(&c_out),
        "[diff:gpu_expert_forward] C output is all zero"
    );
    assert!(
        has_nonzero(&rs_out),
        "[diff:gpu_expert_forward] Rust output is all zero"
    );
    assert!(
        cos >= COSINE_FLOOR,
        "[diff:gpu_expert_forward] cosine {cos:.7} below {COSINE_FLOOR}"
    );
    assert!(
        rel <= REL_DIFF_FLOOR,
        "[diff:gpu_expert_forward] relative max_abs_diff {rel:.3e} \
         above {REL_DIFF_FLOOR:.3e}"
    );
}
/// Differential test: `gpu_rms_norm_fused` must produce closely matching
/// output from the C and Rust backends.
///
/// The weight bytes for `model.norm.weight` are read from the artifacts'
/// weight file (expected to be `hidden_dim * 2` bytes long) and applied to a
/// synthetic sinusoidal input. Both outputs must be finite, with cosine
/// similarity at least `COSINE_FLOOR` and relative max difference at most
/// `REL_DIFF_FLOOR`.
#[test]
#[ignore = "long running; needs Metal device + moeflux artifacts"]
fn gpu_rms_norm_fused_close_c_vs_rust() {
    use moeflux::riir::VARIANT;

    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;

    /// Largest absolute value in `xs` (0 for an empty slice).
    fn peak_abs(xs: &[f32]) -> f32 {
        xs.iter().fold(0.0f32, |peak, x| peak.max(x.abs()))
    }
    /// True when no element is NaN or infinite.
    fn all_finite(xs: &[f32]) -> bool {
        !xs.iter().any(|x| !x.is_finite())
    }

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();

    let art = artifacts_dir();
    let wf = moeflux::riir::WeightFile::open(
        &art.join("model_weights.bin"),
        &art.join("model_weights.json"),
    )
    .expect("WeightFile::open");
    let weight = wf
        .tensor_bytes("model.norm.weight")
        .expect("model.norm.weight present in manifest");
    assert_eq!(weight.len(), VARIANT.hidden_dim * 2);

    let x: Vec<f32> = (0..VARIANT.hidden_dim)
        .map(|i| (i as f32 * 0.013).sin() * 0.5 + 0.1)
        .collect();

    let c_out = c.gpu_rms_norm_fused(&x, weight);
    let rs_out = rs.gpu_rms_norm_fused(&x, weight);
    assert_eq!(c_out.len(), VARIANT.hidden_dim);
    assert_eq!(rs_out.len(), VARIANT.hidden_dim);

    let cos = cosine_sim(&c_out, &rs_out);
    // The magnitude reference spans both outputs.
    let max_abs_out = peak_abs(&c_out).max(peak_abs(&rs_out));
    let max_abs_diff = c_out
        .iter()
        .zip(&rs_out)
        .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
    let rel = if max_abs_out > 0.0 {
        max_abs_diff / max_abs_out
    } else {
        0.0
    };
    eprintln!(
        "[diff:gpu_rms_norm_fused] cosine={cos:.7} \
         max_abs_diff={max_abs_diff:.3e} max_abs_out={max_abs_out:.3e} \
         relative={rel:.3e}"
    );

    assert!(
        all_finite(&c_out),
        "[diff:gpu_rms_norm_fused] C output has NaN/Inf"
    );
    assert!(
        all_finite(&rs_out),
        "[diff:gpu_rms_norm_fused] Rust output has NaN/Inf"
    );
    assert!(
        cos >= COSINE_FLOOR,
        "[diff:gpu_rms_norm_fused] cosine {cos:.7} below {COSINE_FLOOR}"
    );
    assert!(
        rel <= REL_DIFF_FLOOR,
        "[diff:gpu_rms_norm_fused] relative max_abs_diff {rel:.3e} \
         above {REL_DIFF_FLOOR:.3e}"
    );
}
/// Differential test: `load_expert_bytes` must return byte-identical data
/// from the C and Rust backends for a handful of `(layer, expert)` probes.
///
/// The probes mix fixed pairs with pairs derived from the variant (the
/// middle layer, and the last layer with the last expert). Each result must
/// be exactly `expert_size_4bit()` bytes long.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn load_expert_bytes_byte_exact_c_vs_rust() {
    use moeflux::riir::VARIANT;

    let c: CBackend = open_backend();
    let rs: RsBackend = open_backend();
    let v = VARIANT;

    let mid_layer = (v.num_layers / 2) as i32;
    let last_layer = (v.num_layers - 1) as i32;
    let last_expert = (v.num_experts - 1) as i32;
    let probes: [(i32, i32); 6] = [
        (0, 0),
        (0, 7),
        (3, 0),
        (3, 255),
        (mid_layer, 42),
        (last_layer, last_expert),
    ];

    for (layer, expert) in probes.iter().copied() {
        let c_bytes = c.load_expert_bytes(layer, expert);
        let rs_bytes = rs.load_expert_bytes(layer, expert);
        assert_eq!(c_bytes.len(), v.expert_size_4bit());
        assert_eq!(rs_bytes.len(), v.expert_size_4bit());

        // Offset of the first differing byte, if any.
        let first_diff = c_bytes
            .iter()
            .zip(&rs_bytes)
            .enumerate()
            .find(|(_, (a, b))| a != b)
            .map(|(offset, _)| offset);
        eprintln!(
            "[diff:load_expert_bytes layer={layer} expert={expert}] \
             {} bytes; first_diff={first_diff:?}",
            c_bytes.len(),
        );
        assert!(
            first_diff.is_none(),
            "[diff:load_expert_bytes layer={layer} expert={expert}] \
             byte mismatch at offset {first_diff:?} \
             (c=0x{:02x} rs=0x{:02x})",
            first_diff.map_or(0, |i| c_bytes[i]),
            first_diff.map_or(0, |i| rs_bytes[i]),
        );
    }
}
/// Differential test: `gpu_batched_experts_forward` with `k = 4` seeded
/// synthetic experts must produce closely matching output from the C and
/// Rust backends.
///
/// The synthetic expert weights are first checked to sum to 1. Both outputs
/// must be finite and not all zero, with cosine similarity at least
/// `COSINE_FLOOR` and relative max difference at most `REL_DIFF_FLOOR`.
#[test]
#[ignore = "long running; needs Metal device + moeflux artifacts"]
fn gpu_batched_experts_forward_close_c_vs_rust() {
    use moeflux::riir::expert_forward::synth;
    use moeflux::riir::VARIANT;

    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;

    /// Largest absolute value in `xs` (0 for an empty slice).
    fn peak_abs(xs: &[f32]) -> f32 {
        xs.iter().fold(0.0f32, |peak, x| peak.max(x.abs()))
    }
    /// True when no element is NaN or infinite.
    fn all_finite(xs: &[f32]) -> bool {
        !xs.iter().any(|x| !x.is_finite())
    }
    /// True when at least one element differs from zero.
    fn has_nonzero(xs: &[f32]) -> bool {
        xs.iter().any(|&x| x != 0.0)
    }

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();

    let k: usize = 4;
    let expert_data = synth::k_expert_data_seeded(k);
    let h_post = synth::h_post_seeded();
    let h_mid = synth::h_mid_seeded();
    let shared_out = synth::shared_out_seeded();
    let weights = synth::expert_weights_seeded(k);
    let shared_gate_score: f32 = -1.0;

    // Sanity-check the synthetic inputs before touching either backend.
    assert_eq!(expert_data.len(), k * VARIANT.expert_size_4bit());
    assert_eq!(weights.len(), k);
    let weight_sum: f32 = weights.iter().sum();
    assert!(
        (weight_sum - 1.0).abs() < 1e-5,
        "synth weights don't sum to 1: {weight_sum}"
    );

    let c_out = c.gpu_batched_experts_forward(
        k as i32, &expert_data, &h_post, &h_mid, &shared_out, &weights,
        shared_gate_score,
    );
    let rs_out = rs.gpu_batched_experts_forward(
        k as i32, &expert_data, &h_post, &h_mid, &shared_out, &weights,
        shared_gate_score,
    );
    assert_eq!(c_out.len(), VARIANT.hidden_dim);
    assert_eq!(rs_out.len(), VARIANT.hidden_dim);

    let cos = cosine_sim(&c_out, &rs_out);
    // The magnitude reference spans both outputs.
    let max_abs_out = peak_abs(&c_out).max(peak_abs(&rs_out));
    let max_abs_diff = c_out
        .iter()
        .zip(&rs_out)
        .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
    let rel = if max_abs_out > 0.0 {
        max_abs_diff / max_abs_out
    } else {
        0.0
    };
    eprintln!(
        "[diff:gpu_batched_experts_forward k={k}] cosine={cos:.7} \
         max_abs_diff={max_abs_diff:.3e} max_abs_out={max_abs_out:.3e} \
         relative={rel:.3e}"
    );

    assert!(
        all_finite(&c_out),
        "[diff:gpu_batched_experts_forward] C output has NaN/Inf"
    );
    assert!(
        all_finite(&rs_out),
        "[diff:gpu_batched_experts_forward] Rust output has NaN/Inf"
    );
    assert!(
        has_nonzero(&c_out),
        "[diff:gpu_batched_experts_forward] C output is all zero"
    );
    assert!(
        has_nonzero(&rs_out),
        "[diff:gpu_batched_experts_forward] Rust output is all zero"
    );
    assert!(
        cos >= COSINE_FLOOR,
        "[diff:gpu_batched_experts_forward] cosine {cos:.7} below {COSINE_FLOOR}"
    );
    assert!(
        rel <= REL_DIFF_FLOOR,
        "[diff:gpu_batched_experts_forward] relative max_abs_diff \
         {rel:.3e} above {REL_DIFF_FLOOR:.3e}"
    );
}
/// Generates `n` deterministic pseudo-signal floats for test inputs.
///
/// Element `i` is `sin(p) * scale + cos(1.7 * p) * (scale * 0.5)` with
/// `p = seed * 0.13 + i * 0.017`, so different seeds give phase-shifted
/// sequences and the values stay within `1.5 * |scale|`.
fn synth_floats(seed: u32, n: usize, scale: f32) -> Vec<f32> {
    let seed_phase = seed as f32 * 0.13;
    let half_scale = scale * 0.5;
    let mut samples = Vec::with_capacity(n);
    for i in 0..n {
        let phase = seed_phase + (i as f32 * 0.017);
        samples.push(phase.sin() * scale + (phase * 1.7).cos() * half_scale);
    }
    samples
}
/// Differential test: `attn_scores_batched` must produce closely matching
/// output from the C and Rust backends at sequence lengths 32, 64, 128 and
/// 512.
///
/// `q` and the K cache are synthetic, seeded per sequence length; the scale
/// is `1 / sqrt(head_dim)`. The output must have `num_heads * seq_len`
/// elements, be finite, and meet `COSINE_FLOOR` / `REL_DIFF_FLOOR`.
#[test]
#[ignore = "long running; needs Metal device + moeflux artifacts"]
fn attn_scores_close_c_vs_rust() {
    use moeflux::riir::VARIANT;

    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;

    /// Largest absolute value in `xs` (0 for an empty slice).
    fn peak_abs(xs: &[f32]) -> f32 {
        xs.iter().fold(0.0f32, |peak, x| peak.max(x.abs()))
    }
    /// True when no element is NaN or infinite.
    fn all_finite(xs: &[f32]) -> bool {
        !xs.iter().any(|x| !x.is_finite())
    }

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();
    let v = VARIANT;
    let num_heads = v.num_attn_heads as u32;
    let num_kv_heads = v.num_kv_heads as u32;
    let head_dim = v.head_dim as u32;
    let kv_dim = num_kv_heads * head_dim;
    let scale = 1.0 / (head_dim as f32).sqrt();
    let q_len = (num_heads * head_dim) as usize;

    for seq_len in [32u32, 64, 128, 512] {
        let q = synth_floats(seq_len, q_len, 0.4);
        let k_cache = synth_floats(seq_len + 1, (seq_len * kv_dim) as usize, 0.3);

        let c_out = c.attn_scores_batched(
            num_heads, num_kv_heads, head_dim, seq_len, &q, &k_cache, scale,
        );
        let rs_out = rs.attn_scores_batched(
            num_heads, num_kv_heads, head_dim, seq_len, &q, &k_cache, scale,
        );
        assert_eq!(c_out.len(), (num_heads * seq_len) as usize);
        assert_eq!(rs_out.len(), c_out.len());

        let cos = cosine_sim(&c_out, &rs_out);
        // The magnitude reference spans both outputs.
        let max_abs_out = peak_abs(&c_out).max(peak_abs(&rs_out));
        let max_abs_diff = c_out
            .iter()
            .zip(&rs_out)
            .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
        let rel = if max_abs_out > 0.0 {
            max_abs_diff / max_abs_out
        } else {
            0.0
        };
        eprintln!(
            "[diff:attn_scores seq_len={seq_len}] cosine={cos:.7} \
             max_abs_diff={max_abs_diff:.3e} \
             max_abs_out={max_abs_out:.3e} relative={rel:.3e}"
        );

        assert!(
            all_finite(&c_out),
            "[diff:attn_scores seq_len={seq_len}] C output has NaN/Inf"
        );
        assert!(
            all_finite(&rs_out),
            "[diff:attn_scores seq_len={seq_len}] Rust output has NaN/Inf"
        );
        assert!(
            cos >= COSINE_FLOOR,
            "[diff:attn_scores seq_len={seq_len}] cosine {cos:.7} \
             below {COSINE_FLOOR}"
        );
        assert!(
            rel <= REL_DIFF_FLOOR,
            "[diff:attn_scores seq_len={seq_len}] relative max_abs_diff \
             {rel:.3e} above {REL_DIFF_FLOOR:.3e}"
        );
    }
}
/// Differential test: `attn_softmax_batched` must produce closely matching
/// output from the C and Rust backends at sequence lengths 32, 64, 128 and
/// 512.
///
/// Input scores are synthetic, seeded per sequence length. Both outputs must
/// have `num_heads * seq_len` elements; each row of the C output must sum to
/// 1 within `1e-3`; both outputs must be finite and meet `COSINE_FLOOR` /
/// `REL_DIFF_FLOOR`.
#[test]
#[ignore = "long running; needs Metal device + moeflux artifacts"]
fn attn_softmax_close_c_vs_rust() {
    use moeflux::riir::VARIANT;

    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();
    let v = VARIANT;
    let num_heads = v.num_attn_heads as u32;

    for &seq_len in &[32u32, 64, 128, 512] {
        let scores =
            synth_floats(seq_len * 7, (num_heads * seq_len) as usize, 1.5);
        let c_out = c.attn_softmax_batched(num_heads, seq_len, &scores);
        let rs_out = rs.attn_softmax_batched(num_heads, seq_len, &scores);
        assert_eq!(c_out.len(), (num_heads * seq_len) as usize);
        // The comparisons below zip the two outputs, and `zip` stops at the
        // shorter side; a truncated Rust output must fail here rather than be
        // compared on a prefix only.
        assert_eq!(rs_out.len(), c_out.len());

        // Each head's row of the C output should be a probability
        // distribution.
        for h in 0..num_heads as usize {
            let row = &c_out[h * seq_len as usize..(h + 1) * seq_len as usize];
            let sum: f32 = row.iter().sum();
            assert!(
                (sum - 1.0).abs() < 1e-3,
                "[diff:attn_softmax seq_len={seq_len}] row {h} sum {sum} \
                 not ~1"
            );
        }

        let cos = cosine_sim(&c_out, &rs_out);
        // The magnitude reference spans both outputs.
        let max_abs_out = c_out
            .iter()
            .chain(rs_out.iter())
            .map(|x| x.abs())
            .fold(0.0f32, f32::max);
        let max_abs_diff = c_out
            .iter()
            .zip(rs_out.iter())
            .map(|(a, b)| (a - b).abs())
            .fold(0.0f32, f32::max);
        let rel = if max_abs_out > 0.0 {
            max_abs_diff / max_abs_out
        } else {
            0.0
        };
        eprintln!(
            "[diff:attn_softmax seq_len={seq_len}] cosine={cos:.7} \
             max_abs_diff={max_abs_diff:.3e} \
             max_abs_out={max_abs_out:.3e} relative={rel:.3e}"
        );

        assert!(
            c_out.iter().all(|x| x.is_finite()),
            "[diff:attn_softmax seq_len={seq_len}] C output has NaN/Inf"
        );
        assert!(
            rs_out.iter().all(|x| x.is_finite()),
            "[diff:attn_softmax seq_len={seq_len}] Rust output has NaN/Inf"
        );
        assert!(
            cos >= COSINE_FLOOR,
            "[diff:attn_softmax seq_len={seq_len}] cosine {cos:.7} \
             below {COSINE_FLOOR}"
        );
        assert!(
            rel <= REL_DIFF_FLOOR,
            "[diff:attn_softmax seq_len={seq_len}] relative max_abs_diff \
             {rel:.3e} above {REL_DIFF_FLOOR:.3e}"
        );
    }
}
/// Differential test: `attn_values_batched` must produce closely matching
/// output from the C and Rust backends at sequence lengths 32, 64, 128 and
/// 512.
///
/// The score input is built on the host by applying a max-subtracted softmax
/// to each head's row of synthetic values; the V cache is synthetic. The
/// output must have `num_heads * head_dim` elements, be finite, and meet
/// `COSINE_FLOOR` / `REL_DIFF_FLOOR`.
#[test]
#[ignore = "long running; needs Metal device + moeflux artifacts"]
fn attn_values_close_c_vs_rust() {
    use moeflux::riir::VARIANT;

    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;

    /// Largest absolute value in `xs` (0 for an empty slice).
    fn peak_abs(xs: &[f32]) -> f32 {
        xs.iter().fold(0.0f32, |peak, x| peak.max(x.abs()))
    }
    /// True when no element is NaN or infinite.
    fn all_finite(xs: &[f32]) -> bool {
        !xs.iter().any(|x| !x.is_finite())
    }

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();
    let v = VARIANT;
    let num_heads = v.num_attn_heads as u32;
    let num_kv_heads = v.num_kv_heads as u32;
    let head_dim = v.head_dim as u32;
    let kv_dim = num_kv_heads * head_dim;

    for seq_len in [32u32, 64, 128, 512] {
        let row_len = seq_len as usize;
        let raw = synth_floats(seq_len + 11, (num_heads * seq_len) as usize, 1.0);

        // Row-wise softmax of `raw`, one row of `seq_len` entries per head.
        let mut scores = vec![0.0f32; raw.len()];
        for (dst, src) in scores
            .chunks_mut(row_len)
            .zip(raw.chunks(row_len))
            .take(num_heads as usize)
        {
            let row_max = src.iter().fold(f32::NEG_INFINITY, |m, &x| m.max(x));
            for (slot, &x) in dst.iter_mut().zip(src) {
                *slot = (x - row_max).exp();
            }
            let denom: f32 = dst.iter().sum();
            for slot in dst.iter_mut() {
                *slot /= denom;
            }
        }

        let v_cache = synth_floats(seq_len + 13, (seq_len * kv_dim) as usize, 0.5);
        let c_out = c.attn_values_batched(
            num_heads, num_kv_heads, head_dim, seq_len, &scores, &v_cache,
        );
        let rs_out = rs.attn_values_batched(
            num_heads, num_kv_heads, head_dim, seq_len, &scores, &v_cache,
        );
        assert_eq!(c_out.len(), (num_heads * head_dim) as usize);
        assert_eq!(rs_out.len(), c_out.len());

        let cos = cosine_sim(&c_out, &rs_out);
        // The magnitude reference spans both outputs.
        let max_abs_out = peak_abs(&c_out).max(peak_abs(&rs_out));
        let max_abs_diff = c_out
            .iter()
            .zip(&rs_out)
            .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
        let rel = if max_abs_out > 0.0 {
            max_abs_diff / max_abs_out
        } else {
            0.0
        };
        eprintln!(
            "[diff:attn_values seq_len={seq_len}] cosine={cos:.7} \
             max_abs_diff={max_abs_diff:.3e} \
             max_abs_out={max_abs_out:.3e} relative={rel:.3e}"
        );

        assert!(
            all_finite(&c_out),
            "[diff:attn_values seq_len={seq_len}] C output has NaN/Inf"
        );
        assert!(
            all_finite(&rs_out),
            "[diff:attn_values seq_len={seq_len}] Rust output has NaN/Inf"
        );
        assert!(
            cos >= COSINE_FLOOR,
            "[diff:attn_values seq_len={seq_len}] cosine {cos:.7} \
             below {COSINE_FLOOR}"
        );
        assert!(
            rel <= REL_DIFF_FLOOR,
            "[diff:attn_values seq_len={seq_len}] relative max_abs_diff \
             {rel:.3e} above {REL_DIFF_FLOOR:.3e}"
        );
    }
}
/// Differential test: `sigmoid_gate` must give the same result on the C and
/// Rust backends — the largest absolute element difference must be exactly
/// zero.
///
/// Inputs are two synthetic vectors of length `num_attn_heads * head_dim`.
/// When a differing element exists, its index and both values are logged
/// before the final assertion fires.
#[test]
#[ignore = "long running; needs Metal device + moeflux artifacts"]
fn sigmoid_gate_close_c_vs_rust() {
    use moeflux::riir::VARIANT;

    /// Largest absolute value in `xs` (0 for an empty slice).
    fn peak_abs(xs: &[f32]) -> f32 {
        xs.iter().fold(0.0f32, |peak, x| peak.max(x.abs()))
    }
    /// True when no element is NaN or infinite.
    fn all_finite(xs: &[f32]) -> bool {
        !xs.iter().any(|x| !x.is_finite())
    }

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();
    let v = VARIANT;
    let dim = (v.num_attn_heads * v.head_dim) as u32;
    let x = synth_floats(101, dim as usize, 0.7);
    let gate = synth_floats(202, dim as usize, 1.2);

    let c_out = c.sigmoid_gate(dim, &gate, &x);
    let rs_out = rs.sigmoid_gate(dim, &gate, &x);
    assert_eq!(c_out.len(), dim as usize);
    assert_eq!(rs_out.len(), dim as usize);

    let max_abs_diff = c_out
        .iter()
        .zip(&rs_out)
        .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
    let cos = cosine_sim(&c_out, &rs_out);
    // The magnitude reference spans both outputs.
    let max_abs_out = peak_abs(&c_out).max(peak_abs(&rs_out));
    eprintln!(
        "[diff:sigmoid_gate] cosine={cos:.7} max_abs_diff={max_abs_diff:.3e} \
         max_abs_out={max_abs_out:.3e}"
    );

    assert!(
        all_finite(&c_out),
        "[diff:sigmoid_gate] C output has NaN/Inf"
    );
    assert!(
        all_finite(&rs_out),
        "[diff:sigmoid_gate] Rust output has NaN/Inf"
    );

    // Report where the outputs first diverge, to aid debugging.
    let first_diff = c_out
        .iter()
        .zip(&rs_out)
        .enumerate()
        .find(|(_, (a, b))| a != b)
        .map(|(offset, _)| offset);
    if let Some(i) = first_diff {
        eprintln!(
            "[diff:sigmoid_gate] first non-bit-exact at i={i}: \
             c={} rs={}",
            c_out[i], rs_out[i]
        );
    }
    assert_eq!(
        max_abs_diff, 0.0,
        "[diff:sigmoid_gate] expected bit-exact, got max_abs_diff {max_abs_diff:.3e}"
    );
}
/// Differential test: a `begin_deferred_experts` / `complete_deferred_experts`
/// pair with `k = 4` seeded synthetic experts must produce closely matching
/// output from the C and Rust backends.
///
/// Both outputs must be `hidden_dim` long, finite and not all zero, with
/// cosine similarity at least `COSINE_FLOOR` and relative max difference at
/// most `REL_DIFF_FLOOR`.
#[test]
#[ignore = "long running; needs Metal device + moeflux artifacts"]
fn deferred_experts_begin_complete_close_c_vs_rust() {
    use moeflux::riir::expert_forward::synth;
    use moeflux::riir::VARIANT;

    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;

    /// Largest absolute value in `xs` (0 for an empty slice).
    fn peak_abs(xs: &[f32]) -> f32 {
        xs.iter().fold(0.0f32, |peak, x| peak.max(x.abs()))
    }
    /// True when no element is NaN or infinite.
    fn all_finite(xs: &[f32]) -> bool {
        !xs.iter().any(|x| !x.is_finite())
    }
    /// True when at least one element differs from zero.
    fn has_nonzero(xs: &[f32]) -> bool {
        xs.iter().any(|&x| x != 0.0)
    }

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();

    let k: usize = 4;
    let expert_data = synth::k_expert_data_seeded(k);
    let h_post = synth::h_post_seeded();
    let h_mid = synth::h_mid_seeded();
    let shared_out = synth::shared_out_seeded();
    let weights = synth::expert_weights_seeded(k);
    let shared_gate_score: f32 = -1.0;
    assert_eq!(expert_data.len(), k * VARIANT.expert_size_4bit());
    assert_eq!(weights.len(), k);

    // C backend first, then Rust: begin followed immediately by complete.
    c.begin_deferred_experts(
        k as i32, &expert_data, &h_post, &h_mid, &shared_out, &weights,
        shared_gate_score,
    );
    let c_out = c.complete_deferred_experts();
    rs.begin_deferred_experts(
        k as i32, &expert_data, &h_post, &h_mid, &shared_out, &weights,
        shared_gate_score,
    );
    let rs_out = rs.complete_deferred_experts();
    assert_eq!(c_out.len(), VARIANT.hidden_dim);
    assert_eq!(rs_out.len(), VARIANT.hidden_dim);

    let cos = cosine_sim(&c_out, &rs_out);
    // The magnitude reference spans both outputs.
    let max_abs_out = peak_abs(&c_out).max(peak_abs(&rs_out));
    let max_abs_diff = c_out
        .iter()
        .zip(&rs_out)
        .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
    let rel = if max_abs_out > 0.0 {
        max_abs_diff / max_abs_out
    } else {
        0.0
    };
    eprintln!(
        "[diff:deferred_experts_begin_complete k={k}] cosine={cos:.7} \
         max_abs_diff={max_abs_diff:.3e} max_abs_out={max_abs_out:.3e} \
         relative={rel:.3e}"
    );

    assert!(
        all_finite(&c_out),
        "[diff:deferred_experts_begin_complete] C output has NaN/Inf"
    );
    assert!(
        all_finite(&rs_out),
        "[diff:deferred_experts_begin_complete] Rust output has NaN/Inf"
    );
    assert!(
        has_nonzero(&c_out),
        "[diff:deferred_experts_begin_complete] C output is all zero"
    );
    assert!(
        has_nonzero(&rs_out),
        "[diff:deferred_experts_begin_complete] Rust output is all zero"
    );
    assert!(
        cos >= COSINE_FLOOR,
        "[diff:deferred_experts_begin_complete] cosine {cos:.7} below {COSINE_FLOOR}"
    );
    assert!(
        rel <= REL_DIFF_FLOOR,
        "[diff:deferred_experts_begin_complete] relative max_abs_diff \
         {rel:.3e} above {REL_DIFF_FLOOR:.3e}"
    );
}
/// Differential test: on each backend, begin a deferred-experts batch with
/// `k1 = 2`, discard it, begin a second batch with `k2 = 4` and complete it;
/// the completed outputs must closely match between C and Rust.
///
/// Both outputs must be `hidden_dim` long, finite and not all zero, with
/// cosine similarity at least `COSINE_FLOOR` and relative max difference at
/// most `REL_DIFF_FLOOR`.
#[test]
#[ignore = "long running; needs Metal device + moeflux artifacts"]
fn deferred_experts_discard_clears_state_c_vs_rust() {
    use moeflux::riir::expert_forward::synth;
    use moeflux::riir::VARIANT;

    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;

    /// Largest absolute value in `xs` (0 for an empty slice).
    fn peak_abs(xs: &[f32]) -> f32 {
        xs.iter().fold(0.0f32, |peak, x| peak.max(x.abs()))
    }
    /// True when no element is NaN or infinite.
    fn all_finite(xs: &[f32]) -> bool {
        !xs.iter().any(|x| !x.is_finite())
    }
    /// True when at least one element differs from zero.
    fn has_nonzero(xs: &[f32]) -> bool {
        xs.iter().any(|&x| x != 0.0)
    }

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();

    // First batch: begun, then discarded.
    let k1: usize = 2;
    let dropped_data = synth::k_expert_data_seeded(k1);
    let dropped_h_post = synth::h_post_seeded();
    let dropped_h_mid = synth::h_mid_seeded();
    let dropped_shared = synth::shared_out_seeded();
    let dropped_weights = synth::expert_weights_seeded(k1);

    // Second batch: begun, then completed.
    let k2: usize = 4;
    let kept_data = synth::k_expert_data_seeded(k2);
    let kept_h_post = synth::h_post_seeded();
    let kept_h_mid = synth::h_mid_seeded();
    let kept_shared = synth::shared_out_seeded();
    let kept_weights = synth::expert_weights_seeded(k2);

    c.begin_deferred_experts(
        k1 as i32, &dropped_data, &dropped_h_post, &dropped_h_mid,
        &dropped_shared, &dropped_weights, -1.0,
    );
    c.discard_deferred_experts();
    c.begin_deferred_experts(
        k2 as i32, &kept_data, &kept_h_post, &kept_h_mid,
        &kept_shared, &kept_weights, -1.0,
    );
    let c_out = c.complete_deferred_experts();

    rs.begin_deferred_experts(
        k1 as i32, &dropped_data, &dropped_h_post, &dropped_h_mid,
        &dropped_shared, &dropped_weights, -1.0,
    );
    rs.discard_deferred_experts();
    rs.begin_deferred_experts(
        k2 as i32, &kept_data, &kept_h_post, &kept_h_mid,
        &kept_shared, &kept_weights, -1.0,
    );
    let rs_out = rs.complete_deferred_experts();

    assert_eq!(c_out.len(), VARIANT.hidden_dim);
    assert_eq!(rs_out.len(), VARIANT.hidden_dim);

    let cos = cosine_sim(&c_out, &rs_out);
    // The magnitude reference spans both outputs.
    let max_abs_out = peak_abs(&c_out).max(peak_abs(&rs_out));
    let max_abs_diff = c_out
        .iter()
        .zip(&rs_out)
        .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
    let rel = if max_abs_out > 0.0 {
        max_abs_diff / max_abs_out
    } else {
        0.0
    };
    eprintln!(
        "[diff:deferred_experts_discard k1={k1} k2={k2}] cosine={cos:.7} \
         max_abs_diff={max_abs_diff:.3e} max_abs_out={max_abs_out:.3e} \
         relative={rel:.3e}"
    );

    assert!(
        all_finite(&c_out),
        "[diff:deferred_experts_discard] C output has NaN/Inf"
    );
    assert!(
        all_finite(&rs_out),
        "[diff:deferred_experts_discard] Rust output has NaN/Inf"
    );
    assert!(
        has_nonzero(&c_out),
        "[diff:deferred_experts_discard] C output is all zero"
    );
    assert!(
        has_nonzero(&rs_out),
        "[diff:deferred_experts_discard] Rust output is all zero"
    );
    assert!(
        cos >= COSINE_FLOOR,
        "[diff:deferred_experts_discard] cosine {cos:.7} below {COSINE_FLOOR}"
    );
    assert!(
        rel <= REL_DIFF_FLOOR,
        "[diff:deferred_experts_discard] relative max_abs_diff \
         {rel:.3e} above {REL_DIFF_FLOOR:.3e}"
    );
}
/// Differential test: `layer_forward_dump` for layer 0 at position 0 must
/// produce closely matching output from the C and Rust backends.
///
/// Both backends are cleared with `memory_clear` first and fed the C
/// backend's embedding of token 1. Outputs must be `hidden_dim` long and
/// finite, the C output must have a peak magnitude above `1e-6`, and the pair
/// must meet `COSINE_FLOOR` / `REL_DIFF_FLOOR` (relative to the C output's
/// peak magnitude).
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn layer_forward_dump_close_c_vs_rust() {
    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;

    /// True when no element is NaN or infinite.
    fn all_finite(xs: &[f32]) -> bool {
        !xs.iter().any(|x| !x.is_finite())
    }

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();
    let hidden_dim = moeflux::riir::VARIANT.hidden_dim;
    c.memory_clear();
    rs.memory_clear();

    let hidden_in = c.embed(1);
    assert_eq!(hidden_in.len(), hidden_dim);

    let layer_idx = 0i32;
    let pos = 0i32;
    let c_out = c.layer_forward_dump(layer_idx, pos, &hidden_in);
    let rs_out = rs.layer_forward_dump(layer_idx, pos, &hidden_in);
    assert_eq!(c_out.len(), hidden_dim);
    assert_eq!(rs_out.len(), hidden_dim);

    assert!(
        all_finite(&c_out),
        "[diff:layer_forward_dump] C output has NaN/Inf"
    );
    assert!(
        all_finite(&rs_out),
        "[diff:layer_forward_dump] Rust output has NaN/Inf"
    );

    // Guard against a silently skipped forward pass on the C side.
    let max_abs_out = c_out.iter().fold(0.0f32, |peak, x| peak.max(x.abs()));
    assert!(
        max_abs_out > 1e-6,
        "[diff:layer_forward_dump] C output magnitude {max_abs_out:.3e} \
         too small — production path likely no-op'd (cross-Ctx layer_cache?)"
    );

    let cos = cosine_sim(&c_out, &rs_out);
    let max_abs_diff = c_out
        .iter()
        .zip(&rs_out)
        .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
    let rel = max_abs_diff / max_abs_out.max(f32::EPSILON);
    eprintln!(
        "[diff:layer_forward_dump layer=0] cosine={cos:.7} \
         max_abs_diff={max_abs_diff:.3e} max_abs_out={max_abs_out:.3e} \
         rel={rel:.3e}"
    );
    assert!(
        cos >= COSINE_FLOOR,
        "[diff:layer_forward_dump] cosine {cos:.7} below {COSINE_FLOOR}"
    );
    assert!(
        rel <= REL_DIFF_FLOOR,
        "[diff:layer_forward_dump] relative max_abs_diff \
         {rel:.3e} above {REL_DIFF_FLOOR:.3e}"
    );
}
/// Differential test: the C backend's `layer_forward_dump` for layer 0 at
/// position 0 is compared against the Rust backend's
/// `layer_forward_dump_with_gpu_combine` called with `false` as its final
/// argument (presumably selecting the CPU combine path, per the test name —
/// confirm against that method's documentation).
///
/// Both backends are cleared first and fed the C backend's embedding of
/// token 1. Outputs must be `hidden_dim` long and finite, the C output must
/// have a peak magnitude above `1e-6`, and the pair must meet
/// `COSINE_FLOOR` / `REL_DIFF_FLOOR` (relative to the C output's peak
/// magnitude).
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn layer_forward_dump_close_c_vs_rust_cpu_combine() {
    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();
    let hidden_dim = moeflux::riir::VARIANT.hidden_dim;
    c.memory_clear();
    rs.memory_clear();

    let hidden_in = c.embed(1);
    assert_eq!(hidden_in.len(), hidden_dim);

    let layer_idx = 0i32;
    let pos = 0i32;
    let c_out = c.layer_forward_dump(layer_idx, pos, &hidden_in);
    let mut rs_out = vec![0.0f32; hidden_dim];
    rs.0.layer_forward_dump_with_gpu_combine(
        layer_idx,
        pos,
        &hidden_in,
        &mut rs_out,
        false,
    )
    .expect("RsBackend layer_forward_dump_with_gpu_combine");

    // `rs_out` is `hidden_dim` long by construction; the C output must match,
    // otherwise the zipped comparison below would silently cover only a
    // prefix of the Rust output.
    assert_eq!(c_out.len(), hidden_dim);
    assert_eq!(rs_out.len(), hidden_dim);

    assert!(
        c_out.iter().all(|x| x.is_finite()),
        "[diff:layer_forward_dump_cpu_combine] C output has NaN/Inf"
    );
    assert!(
        rs_out.iter().all(|x| x.is_finite()),
        "[diff:layer_forward_dump_cpu_combine] Rust output has NaN/Inf"
    );

    let max_abs_out =
        c_out.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
    assert!(
        max_abs_out > 1e-6,
        "[diff:layer_forward_dump_cpu_combine] C output magnitude \
         {max_abs_out:.3e} too small"
    );

    let cos = cosine_sim(&c_out, &rs_out);
    let max_abs_diff = c_out
        .iter()
        .zip(rs_out.iter())
        .map(|(a, b)| (a - b).abs())
        .fold(0.0f32, f32::max);
    // EPSILON floor keeps the division well defined.
    let rel = max_abs_diff / max_abs_out.max(f32::EPSILON);
    eprintln!(
        "[diff:layer_forward_dump_cpu_combine layer=0] cosine={cos:.7} \
         max_abs_diff={max_abs_diff:.3e} max_abs_out={max_abs_out:.3e} \
         rel={rel:.3e}"
    );
    assert!(
        cos >= COSINE_FLOOR,
        "[diff:layer_forward_dump_cpu_combine] cosine {cos:.7} below \
         {COSINE_FLOOR}"
    );
    assert!(
        rel <= REL_DIFF_FLOOR,
        "[diff:layer_forward_dump_cpu_combine] relative max_abs_diff \
         {rel:.3e} above {REL_DIFF_FLOOR:.3e}"
    );
}
/// Rust-only regression test: five consecutive `memory_clear` +
/// `layer_forward_dump` rounds on layer 0 / position 0 with the same input
/// must all return the same output (max absolute difference exactly zero
/// against the first round).
///
/// Each round's output must also be `hidden_dim` long, finite, and have a
/// peak magnitude above `1e-6`.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn layer_forward_dump_back_to_back_no_deferred_leak() {
    let mut rs: RsBackend = open_backend();
    let hidden_dim = moeflux::riir::VARIANT.hidden_dim;
    let hidden_in = rs.embed(1);
    assert_eq!(hidden_in.len(), hidden_dim);

    let layer_idx = 0i32;
    let pos = 0i32;
    let n_iters = 5usize;

    // Run every round up front, validating each output as it is produced.
    let outs: Vec<Vec<f32>> = (0..n_iters)
        .map(|i| {
            rs.memory_clear();
            let out = rs.layer_forward_dump(layer_idx, pos, &hidden_in);
            assert_eq!(out.len(), hidden_dim, "iter {i}: output length");
            assert!(
                !out.iter().any(|x| !x.is_finite()),
                "iter {i}: output has NaN/Inf — likely stale deferred state"
            );
            let max_abs = out.iter().fold(0.0f32, |peak, x| peak.max(x.abs()));
            assert!(
                max_abs > 1e-6,
                "iter {i}: output magnitude {max_abs:.3e} too small — drain \
                 likely reading from wrong buffer or hitting AlreadyActive"
            );
            out
        })
        .collect();

    // Every later round is compared against round 0.
    let baseline = &outs[0];
    for (i, later) in outs.iter().enumerate().skip(1) {
        let drift_max = baseline
            .iter()
            .zip(later)
            .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
        assert_eq!(
            drift_max, 0.0,
            "iter 0 vs iter {i} differ by max_abs_diff={drift_max:.3e} — \
             deferred-experts state leaked across calls or memory_clear \
             did not reset all recurrence"
        );
    }
    eprintln!(
        "[diff:layer_forward_dump_back_to_back] {n_iters} iterations \
         bit-identical (max_abs_diff=0)"
    );
}
/// Differential test: `layer_forward_dump` for layer 3 (labelled "full-attn"
/// in the log line) at position 0 must produce closely matching output from
/// the C and Rust backends.
///
/// Both backends are cleared with `memory_clear` first and fed the C
/// backend's embedding of token 1. Outputs must be `hidden_dim` long and
/// finite, the C output must have a peak magnitude above `1e-6`, and the pair
/// must meet `COSINE_FLOOR` / `REL_DIFF_FLOOR` (relative to the C output's
/// peak magnitude).
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn layer_forward_dump_close_c_vs_rust_full_attn() {
    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;

    /// True when no element is NaN or infinite.
    fn all_finite(xs: &[f32]) -> bool {
        !xs.iter().any(|x| !x.is_finite())
    }

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();
    let hidden_dim = moeflux::riir::VARIANT.hidden_dim;
    c.memory_clear();
    rs.memory_clear();

    let hidden_in = c.embed(1);
    assert_eq!(hidden_in.len(), hidden_dim);

    let layer_idx = 3i32;
    let pos = 0i32;
    let c_out = c.layer_forward_dump(layer_idx, pos, &hidden_in);
    let rs_out = rs.layer_forward_dump(layer_idx, pos, &hidden_in);
    assert_eq!(c_out.len(), hidden_dim);
    assert_eq!(rs_out.len(), hidden_dim);

    assert!(
        all_finite(&c_out),
        "[diff:layer_forward_dump_full] C output has NaN/Inf"
    );
    assert!(
        all_finite(&rs_out),
        "[diff:layer_forward_dump_full] Rust output has NaN/Inf"
    );

    // Guard against a silently skipped forward pass on the C side.
    let max_abs_out = c_out.iter().fold(0.0f32, |peak, x| peak.max(x.abs()));
    assert!(
        max_abs_out > 1e-6,
        "[diff:layer_forward_dump_full] C output magnitude {max_abs_out:.3e} \
         too small — production path likely no-op'd (cross-Ctx layer_cache?)"
    );

    let cos = cosine_sim(&c_out, &rs_out);
    let max_abs_diff = c_out
        .iter()
        .zip(&rs_out)
        .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
    let rel = max_abs_diff / max_abs_out.max(f32::EPSILON);
    eprintln!(
        "[diff:layer_forward_dump layer=3 (full-attn)] cosine={cos:.7} \
         max_abs_diff={max_abs_diff:.3e} max_abs_out={max_abs_out:.3e} \
         rel={rel:.3e}"
    );
    assert!(
        cos >= COSINE_FLOOR,
        "[diff:layer_forward_dump_full] cosine {cos:.7} below {COSINE_FLOOR}"
    );
    assert!(
        rel <= REL_DIFF_FLOOR,
        "[diff:layer_forward_dump_full] relative max_abs_diff \
         {rel:.3e} above {REL_DIFF_FLOOR:.3e}"
    );
}
/// Compares the C and Rust `layer_forward_dump` outputs for layer 3 after the
/// layer has already been run at 32 earlier positions.
///
/// Both backends are cleared, then run at positions `0..32` with the C
/// embedding of token `1 + pos`; those outputs are discarded. The comparison
/// happens at position 32 and uses the same checks as the position-0 variant:
/// lengths, finiteness, non-trivial C magnitude, cosine floor and relative
/// max-abs-diff ceiling.
///
/// NOTE(review): the log label says this exercises "GPU SDPA at kv_len=33";
/// the dispatch that selects that path is not visible in this test — confirm
/// against the backend.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn layer_forward_dump_close_c_vs_rust_full_attn_gpu_path() {
    const COSINE_FLOOR: f32 = 0.9999;
    const REL_DIFF_FLOOR: f32 = 1e-3;
    // Number of positions run (and discarded) before the compared call.
    const WARMUP_STEPS: i32 = 32;

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();
    let hidden_dim = moeflux::riir::VARIANT.hidden_dim;
    c.memory_clear();
    rs.memory_clear();
    let layer_idx = 3i32;

    for pos in 0..WARMUP_STEPS {
        let hidden_in = c.embed(1 + pos);
        let _ = c.layer_forward_dump(layer_idx, pos, &hidden_in);
        let _ = rs.layer_forward_dump(layer_idx, pos, &hidden_in);
    }

    let pos = WARMUP_STEPS;
    let hidden_in = c.embed(1 + pos);
    let c_out = c.layer_forward_dump(layer_idx, pos, &hidden_in);
    let rs_out = rs.layer_forward_dump(layer_idx, pos, &hidden_in);
    assert_eq!(c_out.len(), hidden_dim);
    assert_eq!(rs_out.len(), hidden_dim);
    assert!(
        c_out.iter().all(|v| v.is_finite()),
        "[diff:layer_forward_dump_full_gpu] C output has NaN/Inf"
    );
    assert!(
        rs_out.iter().all(|v| v.is_finite()),
        "[diff:layer_forward_dump_full_gpu] Rust output has NaN/Inf"
    );

    let max_abs_out = c_out.iter().fold(0.0f32, |peak, v| peak.max(v.abs()));
    assert!(
        max_abs_out > 1e-6,
        "[diff:layer_forward_dump_full_gpu] C output magnitude \
         {max_abs_out:.3e} too small — GPU path likely no-op'd"
    );

    let cos = cosine_sim(&c_out, &rs_out);
    let max_abs_diff = c_out
        .iter()
        .zip(&rs_out)
        .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
    let rel = max_abs_diff / max_abs_out.max(f32::EPSILON);
    eprintln!(
        "[diff:layer_forward_dump layer=3 (full-attn, GPU SDPA at kv_len=33)] \
         cosine={cos:.7} max_abs_diff={max_abs_diff:.3e} \
         max_abs_out={max_abs_out:.3e} rel={rel:.3e}"
    );

    assert!(
        cos >= COSINE_FLOOR,
        "[diff:layer_forward_dump_full_gpu] cosine {cos:.7} below \
         {COSINE_FLOOR}"
    );
    assert!(
        rel <= REL_DIFF_FLOOR,
        "[diff:layer_forward_dump_full_gpu] relative max_abs_diff \
         {rel:.3e} above {REL_DIFF_FLOOR:.3e}"
    );
}
/// Smoke test for the C backend on its own.
///
/// Clears the backend, embeds token 1, runs `layer_forward_dump` for layer 0
/// at position 0, and checks that the output has `hidden_dim` finite
/// elements. The first four input and output values are logged.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn layer_forward_dump_c_self_sanity() {
    let mut c: CBackend = open_backend();
    let hidden_dim = moeflux::riir::VARIANT.hidden_dim;
    c.memory_clear();

    let hidden_in = c.embed(1);
    assert_eq!(hidden_in.len(), hidden_dim);

    let hidden_out = c.layer_forward_dump(0, 0, &hidden_in);
    assert_eq!(hidden_out.len(), hidden_dim);
    let all_finite = hidden_out.iter().all(|v| v.is_finite());
    assert!(
        all_finite,
        "[diff:layer_forward_dump_c] hidden_out contains NaN/Inf"
    );

    let (in_head, out_head) = (&hidden_in[..4], &hidden_out[..4]);
    eprintln!(
        "[diff:layer_forward_dump_c] layer=0 hidden_dim={hidden_dim} \
         first 4 in={:?} out={:?}",
        in_head, out_head,
    );
}
/// Checks that the memory bookkeeping calls behave the same on freshly
/// opened C and Rust backends.
///
/// `memory_seq_pos_max` must agree (and be 0) right after opening and must
/// still agree after `memory_clear`. For a fixed list of `(p0, p1)` ranges,
/// `memory_seq_rm` must return the same value on both backends and leave
/// `memory_seq_pos_max` in agreement.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn memory_ops_match_c_on_empty_state() {
    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();
    eprintln!(
        "[diff:memory_ops] model={} num_layers reachable via VARIANT",
        c.model_name(),
    );

    // Freshly opened state.
    assert_eq!(c.memory_seq_pos_max(), rs.memory_seq_pos_max());
    assert_eq!(c.memory_seq_pos_max(), 0);

    // State after an explicit clear.
    c.memory_clear();
    rs.memory_clear();
    assert_eq!(c.memory_seq_pos_max(), rs.memory_seq_pos_max());

    // Ranges to remove; the list includes negative and reversed bounds.
    let removal_ranges = [(0, -1), (5, 10), (-1, -1), (100, 50), (0, 0)];
    for (p0, p1) in removal_ranges {
        let c_ok = c.memory_seq_rm(p0, p1);
        let rs_ok = rs.memory_seq_rm(p0, p1);
        assert_eq!(
            c_ok, rs_ok,
            "[diff:memory_ops] memory_seq_rm({p0}, {p1}) return mismatch \
             (c={c_ok} rs={rs_ok})"
        );
        assert_eq!(
            c.memory_seq_pos_max(),
            rs.memory_seq_pos_max(),
            "[diff:memory_ops] pos_max mismatch after memory_seq_rm({p0}, {p1})"
        );
    }

    eprintln!("[diff:memory_ops] structural equivalence on empty state: OK");
}
/// Scaffold check: the C backend opens and evaluates a prompt, and the
/// comparison helpers behave sensibly on identical inputs.
///
/// Evaluates a four-token prompt, checks the logits have `n_vocab` entries,
/// then verifies that `jaccard` of a top-k set with itself and `cosine_sim`
/// of the logits with themselves are both 1.0 within tolerance.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn harness_loads() {
    let mut c: CBackend = open_backend();
    eprintln!(
        "[diff:scaffold] model={} n_vocab={} n_ctx={} eos={}",
        c.model_name(),
        c.n_vocab(),
        c.n_ctx(),
        c.eos(),
    );

    let prompt: [i32; 4] = [1, 200, 600, 1100];
    let logits = c.eval_prompt(&prompt, 0);
    assert_eq!(logits.len(), c.n_vocab());

    let arg = argmax(&logits);
    let top = topk(&logits, TOPK_K);
    // Comparing a value with itself must give a perfect score.
    let self_jac = jaccard(&top, &top);
    let self_cos = cosine_sim(&logits, &logits);
    eprintln!(
        "[diff:scaffold] argmax={arg} top-{TOPK_K}={top:?} \
         self-jaccard={self_jac:.4} self-cosine={self_cos:.5}"
    );

    let jac_error = (self_jac - 1.0).abs();
    let cos_error = (self_cos - 1.0).abs();
    assert!(jac_error < 1e-6, "self-jaccard != 1.0");
    assert!(cos_error < 1e-4, "self-cosine != 1.0");
}
/// Minimum cosine similarity between C and Rust logits that
/// `assert_e2e_logits_close` accepts.
const E2E_COSINE_FLOOR: f32 = 0.9999;
/// Asserts that two end-to-end logit vectors agree closely enough.
///
/// `label` prefixes every log and failure message. The two slices must have
/// the same length and contain only finite values. A summary line (argmax of
/// each side, cosine similarity, max absolute difference, that difference
/// relative to the C peak magnitude, and top-k Jaccard overlap) is written to
/// stderr. Only two of those metrics are enforced: the argmax indices must
/// match and the cosine similarity must be at least `E2E_COSINE_FLOOR`.
///
/// # Panics
///
/// Panics when any of the checks above fails.
fn assert_e2e_logits_close(label: &str, c_logits: &[f32], rs_logits: &[f32]) {
    assert_eq!(c_logits.len(), rs_logits.len(), "[{label}] logits length");
    assert!(
        c_logits.iter().all(|v| v.is_finite()),
        "[{label}] C logits contain NaN/Inf"
    );
    assert!(
        rs_logits.iter().all(|v| v.is_finite()),
        "[{label}] Rust logits contain NaN/Inf"
    );

    let c_arg = argmax(c_logits);
    let rs_arg = argmax(rs_logits);
    let cos = cosine_sim(c_logits, rs_logits);

    // Diagnostics only: these feed the log line and are not asserted on.
    let max_abs_c = c_logits.iter().fold(0.0f32, |peak, v| peak.max(v.abs()));
    let max_abs_diff = c_logits
        .iter()
        .zip(rs_logits)
        .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
    let rel = max_abs_diff / max_abs_c.max(f32::EPSILON);
    let c_top = topk(c_logits, TOPK_K);
    let rs_top = topk(rs_logits, TOPK_K);
    let jac = jaccard(&c_top, &rs_top);
    eprintln!(
        "[diff:{label}] argmax c={c_arg} rs={rs_arg} cosine={cos:.7} \
         max_abs_diff={max_abs_diff:.3e} max_abs_c={max_abs_c:.3e} \
         rel={rel:.3e} top-{TOPK_K} jaccard={jac:.4}"
    );

    assert_eq!(
        c_arg, rs_arg,
        "[{label}] argmax mismatch (c={c_arg} rs={rs_arg})"
    );
    assert!(
        cos >= E2E_COSINE_FLOOR,
        "[{label}] cosine {cos:.7} below {E2E_COSINE_FLOOR}"
    );
}
/// A single `eval_token` step after an identical four-token prefill must
/// produce matching logits on the C and Rust backends.
///
/// The prefill logits are discarded; only the logits for token 7 at the
/// position right after the prefill are compared, via
/// `assert_e2e_logits_close`.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn eval_token_matches_c_single_step() {
    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();

    let prefill: [i32; 4] = [1, 200, 600, 1100];
    let _ = c.eval_prompt(&prefill, 0);
    let _ = rs.eval_prompt(&prefill, 0);

    // The decoded token lands at the first position after the prefill.
    let (next_token, next_pos) = (7i32, prefill.len());
    let c_logits = c.eval_token(next_token, next_pos);
    let rs_logits = rs.eval_token(next_token, next_pos);

    assert_e2e_logits_close("eval_token_after_4tok_prefill", &c_logits, &rs_logits);
}
/// After the same four-token prefill, both backends must report the same
/// `state_size`, and that size must be greater than 32.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn state_size_matches_c_after_prefill() {
    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();

    let prompt: [i32; 4] = [1, 200, 600, 1100];
    let _ = c.eval_prompt(&prompt, 0);
    let _ = rs.eval_prompt(&prompt, 0);

    let c_size = c.0.state_size();
    let rs_size = rs.0.state_size();
    eprintln!("[diff:state_size_after_4tok] c={c_size} rs={rs_size}");

    assert_eq!(
        c_size, rs_size,
        "state_size mismatch: c={c_size} rs={rs_size}"
    );
    assert!(c_size > 32, "state_size {c_size} suspiciously small");
}
/// Saving and reloading Rust backend state must not change the next step.
///
/// A reference backend runs a four-token prefill followed by one
/// `eval_token`. A second backend runs the same prefill, saves its state,
/// calls `memory_clear`, loads the snapshot back, and runs the same
/// `eval_token`. The two logit vectors must have equal length, the same
/// argmax, and a cosine similarity of at least 0.9999.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn state_round_trip_rust() {
    let prompt: [i32; 4] = [1, 200, 600, 1100];
    let (next_token, next_pos) = (7i32, prompt.len());

    // Reference: prefill then decode, with no save/load in between.
    let mut rs_ref: RsBackend = open_backend();
    let _ = rs_ref.eval_prompt(&prompt, 0);
    let ref_logits = rs_ref.eval_token(next_token, next_pos);

    // Subject: prefill, snapshot, wipe, restore, then decode.
    let mut rs: RsBackend = open_backend();
    let _ = rs.eval_prompt(&prompt, 0);
    let snap_size = rs.0.state_size();
    let mut snap = vec![0u8; snap_size];
    let written = rs.0.state_save(&mut snap).expect("Rust state_save");
    assert_eq!(written, snap_size, "state_save wrote unexpected length");
    rs.memory_clear();
    rs.0.state_load(&snap).expect("Rust state_load");
    let test_logits = rs.eval_token(next_token, next_pos);

    assert_eq!(test_logits.len(), ref_logits.len());
    let drift_max = ref_logits
        .iter()
        .zip(&test_logits)
        .fold(0.0f32, |worst, (a, b)| worst.max((a - b).abs()));
    let cos = cosine_sim(&ref_logits, &test_logits);
    eprintln!(
        "[diff:state_round_trip_rust] snap_bytes={snap_size} \
         max_abs_diff={drift_max:.3e} cosine={cos:.7}"
    );

    assert_eq!(
        argmax(&ref_logits),
        argmax(&test_logits),
        "round-trip changed argmax"
    );
    assert!(cos >= 0.9999, "round-trip cosine {cos:.7} below 0.9999");
}
/// A snapshot saved by the Rust backend must be loadable by the C backend
/// and lead to matching logits on the next step.
///
/// The Rust backend runs a four-token prefill, saves its state, then
/// evaluates one more token as the reference. A fresh C backend is cleared,
/// loads the Rust snapshot, and evaluates the same token at the same
/// position. The two logit vectors are compared with
/// `assert_e2e_logits_close`.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn state_load_c_from_rust_save() {
    let mut rs: RsBackend = open_backend();
    let prompt: [i32; 4] = [1, 200, 600, 1100];
    let next_token = 7i32;
    let next_pos = prompt.len();
    let _ = rs.eval_prompt(&prompt, 0);

    let snap_size = rs.0.state_size();
    let mut snap = vec![0u8; snap_size];
    // A short write would leave zero padding in the buffer and hand the C
    // loader a malformed snapshot, so verify the reported length just as
    // `state_round_trip_rust` does.
    let written = rs.0.state_save(&mut snap).expect("Rust state_save");
    assert_eq!(written, snap_size, "state_save wrote unexpected length");
    let rs_logits = rs.eval_token(next_token, next_pos);

    let mut c: CBackend = open_backend();
    c.memory_clear();
    c.0.state_load(&snap).expect("C state_load(rust_snap)");
    // Calls the wrapped context directly with a caller-provided output buffer.
    let mut c_logits = vec![0.0f32; c.0.n_vocab()];
    c.0
        .eval_token(next_token, next_pos, 0, &mut c_logits)
        .expect("C eval_token after Rust state_load");

    assert_e2e_logits_close(
        "state_load_c_from_rust_save",
        &c_logits,
        &rs_logits,
    );
}
/// A snapshot saved by the C backend must be loadable by the Rust backend
/// and lead to matching logits on the next step.
///
/// The C backend runs a four-token prefill, saves its state, then evaluates
/// one more token as the reference. A fresh Rust backend is cleared, loads
/// the C snapshot, and evaluates the same token at the same position. The
/// two logit vectors are compared with `assert_e2e_logits_close`.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn state_load_rust_from_c_save() {
    let prompt: [i32; 4] = [1, 200, 600, 1100];
    let (next_token, next_pos) = (7i32, prompt.len());

    // C side: prefill, snapshot, then produce the reference logits.
    let mut c: CBackend = open_backend();
    let _ = c.eval_prompt(&prompt, 0);
    let snap_size = c.0.state_size();
    let mut snap = vec![0u8; snap_size];
    c.0.state_save(&mut snap).expect("C state_save");
    let mut c_logits = vec![0.0f32; c.0.n_vocab()];
    c.0
        .eval_token(next_token, next_pos, 0, &mut c_logits)
        .expect("C eval_token reference");

    // Rust side: start clean, restore the C snapshot, decode the same token.
    let mut rs: RsBackend = open_backend();
    rs.memory_clear();
    rs.0.state_load(&snap).expect("Rust state_load(c_snap)");
    let rs_logits = rs.eval_token(next_token, next_pos);

    assert_e2e_logits_close("state_load_rust_from_c_save", &c_logits, &rs_logits);
}
/// `eval_prompt` on an eight-token prompt (token ids 1 through 8) must give
/// matching logits on the C and Rust backends, as judged by
/// `assert_e2e_logits_close`.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn eval_prompt_matches_c_multi_token() {
    let prompt: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];

    let mut c: CBackend = open_backend();
    let mut rs: RsBackend = open_backend();
    let c_logits = c.eval_prompt(&prompt, 0);
    let rs_logits = rs.eval_prompt(&prompt, 0);

    assert_e2e_logits_close("eval_prompt_8tok", &c_logits, &rs_logits);
}
/// Clearing prefetch predictions before a decode step must not change the
/// resulting logits at all.
///
/// Two Rust backends run the same four-token prefill. One then decodes
/// directly; the other calls `clear_prefetch_predictions` first (the failure
/// message calls this the "all-miss" path). The logits must have the same
/// length, the same argmax, and a maximum absolute difference of exactly
/// zero.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn prefetch_hit_miss_equivalence_rust() {
    let prompt: [i32; 4] = [1, 200, 600, 1100];
    let (next_token, next_pos) = (7i32, prompt.len());

    // Baseline: decode straight after the prefill.
    let mut rs_normal: RsBackend = open_backend();
    let _ = rs_normal.eval_prompt(&prompt, 0);
    let normal_logits = rs_normal.eval_token(next_token, next_pos);

    // Variant: drop the prefetch predictions before decoding.
    let mut rs_miss: RsBackend = open_backend();
    let _ = rs_miss.eval_prompt(&prompt, 0);
    rs_miss.0.clear_prefetch_predictions();
    let miss_logits = rs_miss.eval_token(next_token, next_pos);

    assert_eq!(normal_logits.len(), miss_logits.len());
    let drift_max = normal_logits
        .iter()
        .zip(&miss_logits)
        .fold(0.0f32, |worst, (x, y)| worst.max((x - y).abs()));
    let cos = cosine_sim(&normal_logits, &miss_logits);
    eprintln!(
        "[diff:prefetch_hit_miss_equivalence] \
         max_abs_diff={drift_max:.3e} cosine={cos:.7} \
         argmax(normal)={a} argmax(miss)={b}",
        a = argmax(&normal_logits),
        b = argmax(&miss_logits),
    );

    assert_eq!(
        argmax(&normal_logits),
        argmax(&miss_logits),
        "prefetch hit and all-miss paths produced different argmax"
    );
    assert_eq!(
        drift_max, 0.0,
        "prefetch hit and all-miss paths should be bit-identical, \
         got drift {drift_max:.3e}"
    );
}
/// `memory_clear` must leave no state behind that affects a later prompt.
///
/// A reference backend evaluates prompt B followed by one token. A second
/// backend first evaluates prompt A, calls `memory_clear`, and then repeats
/// the prompt-B sequence. The two final logit vectors must have equal
/// length, the same argmax, and a cosine similarity of at least 0.9999.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn memory_clear_cancels_prefetch_no_leak() {
    let prompt_a: [i32; 4] = [1, 200, 600, 1100];
    let prompt_b: [i32; 4] = [2, 300, 700, 1200];
    let (next_token, next_pos) = (7i32, prompt_b.len());

    // Reference: prompt B on a backend that has seen nothing else.
    let mut rs_ref: RsBackend = open_backend();
    let _ = rs_ref.eval_prompt(&prompt_b, 0);
    let ref_logits = rs_ref.eval_token(next_token, next_pos);

    // Subject: prompt A, clear, then the same prompt-B sequence.
    let mut rs: RsBackend = open_backend();
    let _ = rs.eval_prompt(&prompt_a, 0);
    rs.memory_clear();
    let _ = rs.eval_prompt(&prompt_b, 0);
    let test_logits = rs.eval_token(next_token, next_pos);

    assert_eq!(test_logits.len(), ref_logits.len());
    let drift_max = ref_logits
        .iter()
        .zip(&test_logits)
        .fold(0.0f32, |worst, (x, y)| worst.max((x - y).abs()));
    let cos = cosine_sim(&ref_logits, &test_logits);
    eprintln!(
        "[diff:memory_clear_cancels_prefetch] \
         max_abs_diff={drift_max:.3e} cosine={cos:.7}"
    );

    assert_eq!(
        argmax(&ref_logits),
        argmax(&test_logits),
        "memory_clear leaked prefetch state across reset"
    );
    assert!(
        cos >= 0.9999,
        "memory_clear leak: cosine {cos:.7} below 0.9999"
    );
}
/// Two consecutive decode steps must give the same second-step logits
/// whether or not prefetch predictions are cleared before each step.
///
/// A reference backend runs a four-token prefill and then decodes two
/// tokens back to back. A second backend does the same but calls
/// `clear_prefetch_predictions` before each decode. The second-step logits
/// must have equal length, the same argmax, and a cosine similarity of at
/// least 0.9999.
#[test]
#[ignore = "long running; needs moeflux artifacts"]
fn slot_reuse_race_regression_rust() {
    let prompt: [i32; 4] = [1, 200, 600, 1100];
    let token_t1 = 7i32;
    let token_t2 = 42i32;
    let pos_t1 = prompt.len();
    let pos_t2 = pos_t1 + 1;

    // Reference: two consecutive decodes with prefetch left untouched.
    let mut rs_ref1: RsBackend = open_backend();
    let _ = rs_ref1.eval_prompt(&prompt, 0);
    let _ = rs_ref1.eval_token(token_t1, pos_t1);
    let ref_t2 = rs_ref1.eval_token(token_t2, pos_t2);

    // Subject: same sequence, clearing prefetch predictions before each step.
    let mut rs: RsBackend = open_backend();
    let _ = rs.eval_prompt(&prompt, 0);
    rs.0.clear_prefetch_predictions();
    let _ = rs.eval_token(token_t1, pos_t1);
    rs.0.clear_prefetch_predictions();
    let test_t2 = rs.eval_token(token_t2, pos_t2);

    // `zip` stops at the shorter input, so a length mismatch would otherwise
    // be hidden from the drift computation below.
    assert_eq!(test_t2.len(), ref_t2.len());
    let drift_max = ref_t2
        .iter()
        .zip(test_t2.iter())
        .map(|(a, b)| (a - b).abs())
        .fold(0.0f32, f32::max);
    let cos = cosine_sim(&ref_t2, &test_t2);
    eprintln!(
        "[diff:slot_reuse_race_regression] \
         max_abs_diff={drift_max:.3e} cosine={cos:.7}"
    );

    assert_eq!(
        argmax(&ref_t2),
        argmax(&test_t2),
        "slot-reuse race: argmax changed across consecutive evals"
    );
    assert!(
        cos >= 0.9999,
        "slot-reuse race regression: cosine {cos:.7} below 0.9999"
    );
}