use std::fmt;
/// Error type shared by the Mamba-2 task helpers and heads in this file.
///
/// `Clone` and `PartialEq` are derived so callers can store, duplicate and
/// compare errors directly (for example with `assert_eq!`). `Eq` cannot be
/// derived because [`Mamba2TaskError::InvalidNucleus`] carries an `f32`; as a
/// consequence `InvalidNucleus(f32::NAN)` never compares equal to itself.
#[derive(Debug, Clone, PartialEq)]
pub enum Mamba2TaskError {
    /// The input contained no usable data.
    EmptyInput,
    /// A slice length disagreed with the dimension it was checked against.
    DimMismatch {
        /// Length the callee required.
        expected: usize,
        /// Length that was actually supplied.
        got: usize,
    },
    /// The requested `k` is larger than the number of logits.
    TopKTooLarge {
        /// Requested number of logits to keep.
        k: usize,
        /// Number of logits that were supplied.
        vocab_size: usize,
    },
    /// A constructor argument was rejected; the payload explains why.
    InvalidConfig(String),
    /// A forward pass could not produce a result; the payload explains why.
    ForwardError(String),
    /// The nucleus probability lies outside `(0, 1]`; the payload is the
    /// offending value.
    InvalidNucleus(f32),
}

impl fmt::Display for Mamba2TaskError {
    /// Writes a single-line, human-readable message. Every message starts
    /// with the prefix `Mamba-2 task error: `.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::EmptyInput => write!(f, "Mamba-2 task error: empty input"),
            Self::DimMismatch { expected, got } => write!(
                f,
                "Mamba-2 task error: dim mismatch — expected {expected}, got {got}"
            ),
            Self::TopKTooLarge { k, vocab_size } => write!(
                f,
                "Mamba-2 task error: top_k={k} exceeds vocab_size={vocab_size}"
            ),
            Self::InvalidConfig(msg) => {
                write!(f, "Mamba-2 task error: invalid config: {msg}")
            }
            Self::ForwardError(msg) => {
                write!(f, "Mamba-2 task error: forward error: {msg}")
            }
            Self::InvalidNucleus(p) => write!(
                f,
                "Mamba-2 task error: nucleus probability {p} out of (0, 1]"
            ),
        }
    }
}

// No underlying source error is wrapped, so the default trait methods suffice.
impl std::error::Error for Mamba2TaskError {}
/// Numerically stable softplus, `ln(1 + e^x)`.
///
/// Evaluated as `max(x, 0) + ln_1p(e^(-|x|))`, which is algebraically the same
/// function but:
///
/// * never exponentiates a positive number, so it cannot overflow, and
/// * uses `ln_1p`, so the result keeps full relative precision when `e^x` is
///   tiny (forming `1.0 + e^x` first would round most of its digits away).
///
/// NaN inputs yield NaN, `+inf` yields `+inf`, and `-inf` yields `0.0`.
pub fn softplus(x: f64) -> f64 {
    // `f64::max` ignores a NaN operand, but the second term is NaN whenever
    // `x` is, so NaN still propagates to the result.
    x.max(0.0) + (-x.abs()).exp().ln_1p()
}
/// SiLU ("swish") activation: `x * sigmoid(x)`, evaluated as `x / (1 + e^(-x))`.
///
/// For very negative finite `x` the denominator overflows to `+inf` and the
/// result is `-0.0`. `x = -inf` produces NaN (`-inf / inf`).
pub fn silu(x: f64) -> f64 {
    let denominator = (-x).exp() + 1.0;
    x / denominator
}
/// Root-mean-square normalisation without a learned scale.
///
/// Each element is divided by `sqrt(mean(input²) + eps)`. An empty slice
/// yields an empty vector.
pub fn rms_norm_f64(input: &[f64], eps: f64) -> Vec<f64> {
    let count = input.len();
    if count == 0 {
        return Vec::new();
    }

    let sum_of_squares: f64 = input.iter().map(|v| v * v).sum();
    let denom = (sum_of_squares / count as f64 + eps).sqrt();

    let mut normalized = Vec::with_capacity(count);
    normalized.extend(input.iter().map(|v| v / denom));
    normalized
}
/// Softmax over `logits`, shifted by the maximum for numerical stability.
///
/// * An empty slice yields an empty vector.
/// * If every logit is `-inf` (a fully masked distribution, e.g. the output of
///   a top-k filter with `k == 0`) there is no probability mass to distribute,
///   so a vector of zeros is returned rather than NaNs.
/// * Otherwise the result is `exp(x - max) / sum(exp(x - max))`. NaN or `+inf`
///   logits still propagate as NaN.
pub fn softmax(logits: &[f32]) -> Vec<f32> {
    if logits.is_empty() {
        return Vec::new();
    }
    // Without this guard `-inf - (-inf)` is NaN and poisons every output.
    // NaN logits do not match here, so they keep propagating below.
    if logits.iter().all(|&l| l == f32::NEG_INFINITY) {
        return vec![0.0; logits.len()];
    }

    let max_v = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let mut probs: Vec<f32> = logits.iter().map(|&x| (x - max_v).exp()).collect();
    // The maximal element contributes exp(0) = 1, so for finite input the sum
    // is at least 1 and can never be zero; no zero-sum guard is needed.
    let sum: f32 = probs.iter().sum();
    for p in &mut probs {
        *p /= sum;
    }
    probs
}
/// Index of the largest logit (argmax), as a token id.
///
/// NaN entries are skipped: a NaN compares as neither smaller nor larger than
/// anything, so letting it take part in the comparison chain can make a
/// non-maximal index win. If several entries share the maximum, the index of
/// the last one is returned (the behaviour of `Iterator::max_by`).
///
/// Returns `None` when `logits` is empty or contains only NaN.
pub fn greedy_decode(logits: &[f32]) -> Option<u32> {
    logits
        .iter()
        .enumerate()
        .filter(|(_, l)| !l.is_nan())
        // With NaN removed `partial_cmp` always succeeds; the fallback is
        // never taken and only avoids an `unwrap`.
        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
        .map(|(i, _)| i as u32)
}
/// Keeps the `k` largest logits and replaces every other entry with `-inf`.
///
/// The cut-off is the value of the `k`-th largest logit; every entry `>=` that
/// value is kept, so ties at the cut-off may leave more than `k` survivors.
/// With `k == 0` every entry is masked.
///
/// NaN logits never take part in the ranking and are always masked. If fewer
/// than `k` comparable (non-NaN) logits exist, all of them are kept.
///
/// # Errors
///
/// Returns [`Mamba2TaskError::TopKTooLarge`] when `k > logits.len()`.
pub fn top_k_filter(logits: &[f32], k: usize) -> Result<Vec<f32>, Mamba2TaskError> {
    let vocab = logits.len();
    if k > vocab {
        return Err(Mamba2TaskError::TopKTooLarge {
            k,
            vocab_size: vocab,
        });
    }
    if k == 0 {
        return Ok(vec![f32::NEG_INFINITY; vocab]);
    }

    // Rank only comparable values: with NaN present, "treat incomparable as
    // equal" is not a total order, and the slice ordering routines do not
    // give a meaningful result (and may panic) for such comparators.
    let mut ranked: Vec<f32> = logits.iter().copied().filter(|l| !l.is_nan()).collect();
    let threshold = if ranked.len() < k {
        f32::NEG_INFINITY
    } else {
        // Selection instead of a full sort: only the k-th largest is needed.
        let (_, kth, _) = ranked.select_nth_unstable_by(k - 1, |a, b| {
            b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal)
        });
        *kth
    };

    // `NaN >= threshold` is false, so NaN entries are masked here as well.
    Ok(logits
        .iter()
        .map(|&l| if l >= threshold { l } else { f32::NEG_INFINITY })
        .collect())
}
/// Nucleus (top-p) filtering.
///
/// Logits are converted to probabilities with `softmax` and visited from most
/// to least probable. Entries are kept until the running probability mass
/// reaches `p`; the entry that crosses the threshold is kept too. Every other
/// entry is replaced with `-inf`. Kept entries retain their original logit
/// value.
///
/// # Errors
///
/// Returns [`Mamba2TaskError::InvalidNucleus`] unless `0 < p <= 1`. NaN is
/// rejected as well: it would otherwise pass a `p <= 0.0 || p > 1.0` test and
/// silently disable the filter, because no cumulative mass is ever `>= NaN`.
pub fn top_p_filter(logits: &[f32], p: f32) -> Result<Vec<f32>, Mamba2TaskError> {
    // Stated as a positive range test so that NaN (for which every comparison
    // is false) lands on the error path.
    let p_in_range = p > 0.0 && p <= 1.0;
    if !p_in_range {
        return Err(Mamba2TaskError::InvalidNucleus(p));
    }

    let probs = softmax(logits);
    let mut indexed: Vec<(usize, f32)> = probs.iter().copied().enumerate().collect();
    // Stable descending sort: equally probable tokens keep their index order.
    indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

    let mut cumsum = 0.0f32;
    let mut included = vec![false; logits.len()];
    for &(orig_idx, prob) in &indexed {
        included[orig_idx] = true;
        cumsum += prob;
        if cumsum >= p {
            break;
        }
    }

    Ok(logits
        .iter()
        .zip(&included)
        .map(|(&l, &keep)| if keep { l } else { f32::NEG_INFINITY })
        .collect())
}
/// Fraction of causal `(query, key)` pairs that fall inside a single chunk
/// when a sequence of `seq_len` positions is split into consecutive chunks of
/// `chunk_size` (the last chunk may be shorter).
///
/// A span of `n` positions contains `n(n+1)/2` causal pairs, so the result is
/// `(q * T(chunk_size) + T(r)) / T(seq_len)` with `q = seq_len / chunk_size`,
/// `r = seq_len % chunk_size` and `T(n) = n(n+1)/2`.
///
/// Returns `0.0` when either argument is zero.
pub fn ssd_chunk_coverage(seq_len: usize, chunk_size: usize) -> f64 {
    if seq_len == 0 || chunk_size == 0 {
        return 0.0;
    }

    // Closed form in u128: constant time, and `n(n+1)` cannot overflow for
    // any `usize` value (summing `1..=n` in `usize` can).
    let triangular = |n: usize| -> u128 {
        let n = n as u128;
        n * (n + 1) / 2
    };

    let full_chunks = (seq_len / chunk_size) as u128;
    let tail_len = seq_len % chunk_size;

    let within_chunk = full_chunks * triangular(chunk_size) + triangular(tail_len);
    let total_causal = triangular(seq_len);
    within_chunk as f64 / total_causal as f64
}
/// One recurrent step of a diagonal state-space model with scalar input.
///
/// Computes, element-wise over the state dimension,
///
/// ```text
/// h_new[i] = a_bar[i] * h[i] + b_bar[i] * x
/// y        = sum_i c[i] * h_new[i] + d * x
/// ```
///
/// and returns `(h_new, y)`.
///
/// # Errors
///
/// Returns [`Mamba2TaskError::DimMismatch`] (with `expected == h.len()`) for
/// the first of `a_bar`, `b_bar`, `c` — checked in that order — whose length
/// differs from `h.len()`.
pub fn ssm_step(
    h: &[f64],
    x: f64,
    a_bar: &[f64],
    b_bar: &[f64],
    c: &[f64],
    d: f64,
) -> Result<(Vec<f64>, f64), Mamba2TaskError> {
    let d_state = h.len();
    for got in [a_bar.len(), b_bar.len(), c.len()] {
        if got != d_state {
            return Err(Mamba2TaskError::DimMismatch {
                expected: d_state,
                got,
            });
        }
    }

    // State update.
    let h_new: Vec<f64> = h
        .iter()
        .zip(a_bar)
        .zip(b_bar)
        .map(|((&h_i, &a_i), &b_i)| a_i * h_i + b_i * x)
        .collect();

    // Readout from the updated state plus the skip connection `d * x`.
    let readout: f64 = c.iter().zip(&h_new).map(|(c_i, h_i)| c_i * h_i).sum();
    Ok((h_new, readout + d * x))
}
/// Language-model head: projects a `d_model`-wide hidden vector onto
/// `vocab_size` logits.
///
/// `Debug` and `Clone` are derived so the head can be logged and duplicated
/// like other public types.
#[derive(Debug, Clone)]
pub struct Mamba2ForCausalLMHead {
    /// Number of output logits (rows of the projection matrix).
    pub vocab_size: usize,
    /// Width of the hidden vector the head accepts.
    pub d_model: usize,
    /// Configuration flag recorded at construction time; nothing in this file
    /// reads it when computing logits.
    pub tie_embeddings: bool,
    /// Projection matrix, stored as `vocab_size` rows of `d_model` weights.
    lm_weight: Vec<Vec<f32>>,
}
impl Mamba2ForCausalLMHead {
    /// Builds a head whose projection matrix has `vocab_size` rows of
    /// `d_model` weights, all initialised to zero.
    ///
    /// # Errors
    ///
    /// Returns [`Mamba2TaskError::InvalidConfig`] when `d_model` is zero, or
    /// otherwise when `vocab_size` is zero.
    pub fn new(
        d_model: usize,
        vocab_size: usize,
        tie_embeddings: bool,
    ) -> Result<Self, Mamba2TaskError> {
        // Checked in this order so that `d_model` is reported first.
        for (value, complaint) in [
            (d_model, "d_model must be > 0"),
            (vocab_size, "vocab_size must be > 0"),
        ] {
            if value == 0 {
                return Err(Mamba2TaskError::InvalidConfig(complaint.to_string()));
            }
        }

        let lm_weight = vec![vec![0.0f32; d_model]; vocab_size];
        Ok(Self {
            vocab_size,
            d_model,
            tie_embeddings,
            lm_weight,
        })
    }

    /// Projects `last_hidden` onto the vocabulary: one dot product per weight
    /// row, returned in row order.
    ///
    /// # Errors
    ///
    /// Returns [`Mamba2TaskError::DimMismatch`] when `last_hidden.len()`
    /// differs from `self.d_model`.
    pub fn compute_logits(&self, last_hidden: &[f32]) -> Result<Vec<f32>, Mamba2TaskError> {
        let got = last_hidden.len();
        if got != self.d_model {
            return Err(Mamba2TaskError::DimMismatch {
                expected: self.d_model,
                got,
            });
        }

        let mut logits = Vec::with_capacity(self.lm_weight.len());
        for row in &self.lm_weight {
            let dot: f32 = row.iter().zip(last_hidden).map(|(w, x)| w * x).sum();
            logits.push(dot);
        }
        Ok(logits)
    }

    /// Computes the logits for `last_hidden` and returns the token id chosen
    /// by `greedy_decode`.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`Self::compute_logits`], and returns
    /// [`Mamba2TaskError::ForwardError`] when `greedy_decode` yields no token.
    pub fn forward_greedy(&self, last_hidden: &[f32]) -> Result<u32, Mamba2TaskError> {
        let logits = self.compute_logits(last_hidden)?;
        match greedy_decode(&logits) {
            Some(token) => Ok(token),
            None => Err(Mamba2TaskError::ForwardError("argmax failed".into())),
        }
    }
}
/// Sequence-classification head: an affine map from a `d_model`-wide hidden
/// vector to `num_labels` logits.
///
/// `Debug` and `Clone` are derived so the head can be logged and duplicated
/// like other public types.
#[derive(Debug, Clone)]
pub struct Mamba2ForSequenceClassification {
    /// Number of output classes (rows of `weight`, entries of `bias`).
    pub num_labels: usize,
    /// Width of one hidden vector.
    pub d_model: usize,
    /// Weight matrix, stored as `num_labels` rows of `d_model` weights.
    weight: Vec<Vec<f32>>,
    /// One additive bias per label.
    bias: Vec<f32>,
}
impl Mamba2ForSequenceClassification {
    /// Builds a head with a `num_labels x d_model` weight matrix and a
    /// `num_labels`-long bias, all initialised to zero.
    ///
    /// # Errors
    ///
    /// Returns [`Mamba2TaskError::InvalidConfig`] when `d_model` is zero, or
    /// otherwise when `num_labels` is zero.
    pub fn new(d_model: usize, num_labels: usize) -> Result<Self, Mamba2TaskError> {
        // Checked in this order so that `d_model` is reported first.
        for (value, complaint) in [
            (d_model, "d_model must be > 0"),
            (num_labels, "num_labels must be > 0"),
        ] {
            if value == 0 {
                return Err(Mamba2TaskError::InvalidConfig(complaint.to_string()));
            }
        }

        let weight = vec![vec![0.0f32; d_model]; num_labels];
        let bias = vec![0.0f32; num_labels];
        Ok(Self {
            num_labels,
            d_model,
            weight,
            bias,
        })
    }

    /// Classifies a sequence from its last hidden vector.
    ///
    /// `hidden_states` is read as consecutive `d_model`-wide vectors. Only the
    /// last *complete* vector is used; trailing elements that do not fill a
    /// whole vector are ignored. The result holds one logit per label:
    /// `weight[label] · last + bias[label]`.
    ///
    /// # Errors
    ///
    /// Returns [`Mamba2TaskError::EmptyInput`] when `hidden_states` is empty
    /// or shorter than `d_model`.
    ///
    /// # Panics
    ///
    /// Panics on non-empty input if the public `d_model` field has been set
    /// to zero after construction.
    pub fn forward(&self, hidden_states: &[f32]) -> Result<Vec<f32>, Mamba2TaskError> {
        if hidden_states.is_empty() {
            return Err(Mamba2TaskError::EmptyInput);
        }

        // Last complete `d_model`-wide window, if there is one.
        let last = match hidden_states.chunks_exact(self.d_model).last() {
            Some(window) => window,
            None => return Err(Mamba2TaskError::EmptyInput),
        };

        let logits = self
            .weight
            .iter()
            .zip(&self.bias)
            .map(|(row, bias)| {
                let dot: f32 = row.iter().zip(last).map(|(w, x)| w * x).sum();
                dot + bias
            })
            .collect();
        Ok(logits)
    }
}
// Unit tests for the helpers, the error type and the two heads defined in
// this file. The two config tests additionally depend on
// `crate::mamba2::config::Mamba2Config`, which is defined outside this file.
#[cfg(test)]
mod tests {
use super::*;
// --- Mamba2Config presets (type defined in `crate::mamba2::config`) ---
#[test]
fn test_mamba2_config_small_test() {
use crate::mamba2::config::Mamba2Config;
let cfg = Mamba2Config::small_test();
assert_eq!(cfg.d_model, 64);
assert_eq!(cfg.n_layer, 2);
assert!(cfg.validate());
}
#[test]
fn test_mamba2_config_2_7b() {
use crate::mamba2::config::Mamba2Config;
let cfg = Mamba2Config::mamba2_2_7b();
assert_eq!(cfg.d_model, 2560);
assert_eq!(cfg.nheads, 80);
assert_eq!(cfg.headdim, 64);
assert!(cfg.validate());
}
// --- Activations ---
// softplus must be strictly positive over a wide input range.
#[test]
fn test_mamba2_softplus_positive() {
for &x in &[-100.0_f64, -10.0, 0.0, 10.0, 100.0] {
assert!(softplus(x) > 0.0, "softplus({x}) must be positive");
}
}
// For large inputs softplus(x) is approximately x.
#[test]
fn test_mamba2_softplus_large_identity() {
let v = softplus(50.0);
assert!((v - 50.0).abs() < 0.01, "softplus(50) ≈ 50, got {v}");
}
#[test]
fn test_mamba2_silu_zero() {
assert!(silu(0.0).abs() < 1e-10);
}
// For large positive inputs silu(x) is approximately x.
#[test]
fn test_mamba2_silu_large_positive() {
assert!((silu(100.0) - 100.0).abs() < 0.01);
}
// --- RMS normalisation ---
// A constant vector has RMS equal to that constant, so it normalises to ~1.
#[test]
fn test_mamba2_rms_norm_constant() {
let x = vec![3.0f64; 8];
let out = rms_norm_f64(&x, 1e-8);
for &v in &out {
assert!(
(v - 1.0).abs() < 1e-6,
"constant must normalize to 1, got {v}"
);
}
}
// mean(3^2, 4^2) = 12.5, so the divisor is sqrt(12.5 + eps).
#[test]
fn test_mamba2_rms_norm_pair() {
let x = vec![3.0f64, 4.0];
let out = rms_norm_f64(&x, 1e-8);
let rms = (12.5_f64 + 1e-8).sqrt();
assert!((out[0] - 3.0 / rms).abs() < 1e-8);
assert!((out[1] - 4.0 / rms).abs() < 1e-8);
}
// --- Sampling helpers ---
// Probabilities sum to 1 and preserve the ordering of the logits.
#[test]
fn test_mamba2_softmax() {
let logits = vec![1.0f32, 2.0, 3.0];
let probs = softmax(&logits);
let sum: f32 = probs.iter().sum();
assert!((sum - 1.0).abs() < 1e-5);
assert!(probs[2] > probs[1] && probs[1] > probs[0]);
}
#[test]
fn test_mamba2_greedy_decode() {
let logits = vec![0.1f32, 0.9, 0.5];
assert_eq!(greedy_decode(&logits), Some(1u32));
}
// With k = 2 the two largest logits (5.0 and 3.0) survive; 1.0 is masked.
#[test]
fn test_mamba2_top_k_filter() {
let logits = vec![1.0f32, 5.0, 3.0, 2.0];
let f = top_k_filter(&logits, 2).expect("top_k");
assert!((f[1] - 5.0).abs() < 1e-6);
assert!((f[2] - 3.0).abs() < 1e-6);
assert!(f[0].is_infinite() && f[0] < 0.0);
}
// k larger than the number of logits is reported with both values.
#[test]
fn test_mamba2_top_k_too_large() {
let logits = vec![1.0f32; 3];
assert!(matches!(
top_k_filter(&logits, 10),
Err(Mamba2TaskError::TopKTooLarge {
k: 10,
vocab_size: 3
})
));
}
// The dominant logit must be kept with its original value.
#[test]
fn test_mamba2_top_p_filter() {
let logits = vec![0.0f32, 10.0, 0.0]; let f = top_p_filter(&logits, 0.99).expect("top_p");
assert!((f[1] - 10.0).abs() < 1e-6);
}
// p = 0 and p > 1 both lie outside (0, 1].
#[test]
fn test_mamba2_top_p_invalid_nucleus() {
let logits = vec![1.0f32; 4];
assert!(matches!(
top_p_filter(&logits, 0.0),
Err(Mamba2TaskError::InvalidNucleus(_))
));
assert!(matches!(
top_p_filter(&logits, 1.1),
Err(Mamba2TaskError::InvalidNucleus(_))
));
}
// --- Chunk coverage ---
// A single chunk spanning the whole sequence covers every causal pair.
#[test]
fn test_mamba2_ssd_chunk_coverage_full() {
let cov = ssd_chunk_coverage(8, 8);
assert!(
(cov - 1.0).abs() < 1e-10,
"full chunk should give coverage 1.0, got {cov}"
);
}
// With chunk size 1 each chunk contributes one pair, so the within-chunk
// count equals seq_len.
#[test]
fn test_mamba2_ssd_chunk_coverage_one() {
let seq_len = 4usize;
let cov = ssd_chunk_coverage(seq_len, 1);
let total: usize = (1..=seq_len).sum();
let expected = seq_len as f64 / total as f64; assert!(
(cov - expected).abs() < 1e-10,
"expected {expected}, got {cov}"
);
}
// --- SSM recurrence ---
// h_new[i] = 0.9 * 1.0 + 0.1 * 0.5 = 0.95 and y = 4 * 0.95 + 0 * 0.5 = 3.8.
#[test]
fn test_mamba2_ssm_step() {
let d_state = 4;
let h = vec![1.0f64; d_state];
let x = 0.5f64;
let a_bar = vec![0.9f64; d_state];
let b_bar = vec![0.1f64; d_state];
let c = vec![1.0f64; d_state];
let d = 0.0f64;
let (h_new, y) = ssm_step(&h, x, &a_bar, &b_bar, &c, d).expect("ssm_step");
assert_eq!(h_new.len(), d_state);
assert!(
(h_new[0] - 0.95).abs() < 1e-10,
"h_new[0] should be 0.95, got {}",
h_new[0]
);
assert!((y - 3.8).abs() < 1e-10, "y should be 3.8, got {y}");
}
// a_bar has 3 entries while the state has 4.
#[test]
fn test_mamba2_ssm_step_dim_mismatch() {
let h = vec![1.0f64; 4];
let a_bar = vec![0.9f64; 3]; let b_bar = vec![0.1f64; 4];
let c = vec![1.0f64; 4];
let result = ssm_step(&h, 0.0, &a_bar, &b_bar, &c, 0.0);
assert!(matches!(result, Err(Mamba2TaskError::DimMismatch { .. })));
}
// --- Causal LM head ---
#[test]
fn test_mamba2_causal_lm_construction() {
let head = Mamba2ForCausalLMHead::new(64, 256, true);
assert!(head.is_ok());
let h = head.expect("causal lm head");
assert_eq!(h.vocab_size, 256);
assert_eq!(h.d_model, 64);
assert!(h.tie_embeddings);
}
// Only checks that the chosen token id lies inside the vocabulary.
#[test]
fn test_mamba2_causal_lm_greedy() {
let head = Mamba2ForCausalLMHead::new(4, 10, false).expect("causal lm");
let token = head.forward_greedy(&[0.0f32; 4]).expect("greedy");
assert!(token < 10u32, "token {token} must be within vocab_size=10");
}
// A hidden vector of width 4 is rejected by a head built with d_model = 8.
#[test]
fn test_mamba2_causal_lm_dim_mismatch() {
let head = Mamba2ForCausalLMHead::new(8, 10, false).expect("causal lm");
assert!(matches!(
head.compute_logits(&[0.0f32; 4]),
Err(Mamba2TaskError::DimMismatch { .. })
));
}
// --- Sequence-classification head ---
// 24 values with d_model = 8 form three hidden vectors; one logit per label.
#[test]
fn test_mamba2_seq_cls_forward() {
let head = Mamba2ForSequenceClassification::new(8, 3).expect("seq cls");
let hidden = vec![0.5f32; 24]; let logits = head.forward(&hidden).expect("forward");
assert_eq!(logits.len(), 3);
}
#[test]
fn test_mamba2_seq_cls_empty() {
let head = Mamba2ForSequenceClassification::new(4, 2).expect("seq cls");
assert!(matches!(
head.forward(&[]),
Err(Mamba2TaskError::EmptyInput)
));
}
// --- Error formatting ---
// Each message must mention the payload it was built from.
#[test]
fn test_mamba2_error_display() {
let e1 = Mamba2TaskError::EmptyInput;
assert!(e1.to_string().contains("empty"));
let e2 = Mamba2TaskError::DimMismatch {
expected: 8,
got: 4,
};
assert!(e2.to_string().contains("8") && e2.to_string().contains("4"));
let e3 = Mamba2TaskError::TopKTooLarge {
k: 10,
vocab_size: 5,
};
assert!(e3.to_string().contains("10"));
let e4 = Mamba2TaskError::InvalidConfig("bad".to_string());
assert!(e4.to_string().contains("bad"));
}
// Deterministic pseudo-random shapes from a 64-bit LCG with a fixed seed:
// d_model is one of 16, 24, 32, 40 and num_labels is in 2..=5. Each head is
// fed two hidden vectors and must return one logit per label.
#[test]
fn test_mamba2_lcg_varied_seq_cls() {
let mut state = 53u64;
for _ in 0..6 {
state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
let d_model = ((state % 4) + 2) as usize * 8;
let num_labels = ((state >> 4) % 4 + 2) as usize;
let head =
Mamba2ForSequenceClassification::new(d_model, num_labels).expect("seq cls head");
let hs: Vec<f32> = (0..d_model * 2).map(|i| i as f32 * 0.01).collect();
let out = head.forward(&hs).expect("forward");
assert_eq!(out.len(), num_labels);
}
}
}