use irithyll::{
attention::{
default_lambda_init, AttentionConfig, AttentionLayer, AttentionMode, GatedDeltaMode,
LogLinearAttention, MultiHeadAttention,
},
generators::MqarStream,
};
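/// Empirical Bernstein confidence radius (Maurer-Pontil style):
/// sqrt(2 * var * ln(2/delta) / n) + 7 * range * ln(2/delta) / (3 * (n - 1)),
/// valid with probability at least 1 - delta for n i.i.d. samples bounded in
/// an interval of width `range`.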
fn empirical_bernstein_bound(n: usize, sample_var: f64, range: f64, delta: f64) -> f64 {
debug_assert!(n >= 2, "Bernstein bound requires n >= 2");
debug_assert!(sample_var >= 0.0, "variance must be non-negative");
debug_assert!(range > 0.0, "range must be positive");
debug_assert!(delta > 0.0 && delta < 1.0, "delta must be in (0, 1)");
let n_f = n as f64;
let log_term = (2.0 / delta).ln();
let var_term = (2.0 * sample_var * log_term / n_f).sqrt();
let range_term = 7.0 * range * log_term / (3.0 * (n_f - 1.0));
var_term + range_term
}
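/// Cosine similarity with norms clamped away from zero to avoid division by zero.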
fn cosine_sim(a: &[f64], b: &[f64]) -> f64 {
let dot: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
let na = a.iter().map(|x| x * x).sum::<f64>().sqrt().max(1e-12);
let nb = b.iter().map(|x| x * x).sum::<f64>().sqrt().max(1e-12);
dot / (na * nb)
}
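/// Unbiased (n - 1 denominator) sample variance; 0.0 for fewer than two samples.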
fn sample_variance(xs: &[f64]) -> f64 {
if xs.len() < 2 {
return 0.0;
}
let n = xs.len() as f64;
let mean: f64 = xs.iter().sum::<f64>() / n;
let ss: f64 = xs.iter().map(|x| (x - mean).powi(2)).sum();
ss / (n - 1.0)
}
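/// Centered moving average, truncated at the series ends; returns the input
/// unchanged for an empty series or a zero window.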
fn moving_average(xs: &[f64], window: usize) -> Vec<f64> {
if xs.is_empty() || window == 0 {
return xs.to_vec();
}
let mut out = Vec::with_capacity(xs.len());
for i in 0..xs.len() {
let lo = i.saturating_sub(window / 2);
let hi = (i + window / 2 + 1).min(xs.len());
let slice = &xs[lo..hi];
let mean = slice.iter().sum::<f64>() / slice.len() as f64;
out.push(mean);
}
out
}
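/// Streaming MQAR recall: a LogLinearAttention model updated with `train_one`
/// must (1) lift its best-epoch recall above the zero-recall random baseline
/// by more than an empirical-Bernstein guard, (2) beat an identically seeded
/// model that only runs `forward`, and (3) show a rising smoothed recall
/// trajectory over training epochs.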
#[test]
fn log_linear_mqar_streaming_dominates_untrained() {
const N_PAIRS: usize = 4;
const D_MODEL: usize = MqarStream::DEFAULT_D_KEY;
const D_KEY: usize = 4;
const D_VALUE: usize = MqarStream::DEFAULT_D_VALUE;
const MAX_LEVELS: usize = 2;
const N_EPOCHS_TRAINED: usize = 200;
const N_EPOCHS_UNTRAINED: usize = 50;
const LEARNING_RATE: f64 = 0.1;
const SEED: u64 = 0x1234_5678_ABCD_EF01;
const SMOOTH_WINDOW: usize = 11;
const DELTA: f64 = 0.05;
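// Deterministic key/value pairs: unit-norm sin-based keys and scaled
// cos-based values, so the recall target is reproducible across runs.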
let pairs: Vec<(Vec<f64>, Vec<f64>)> = (0..N_PAIRS)
.map(|i| {
let mut k: Vec<f64> = (0..D_MODEL)
.map(|j| ((i * 13 + j * 7) as f64).sin())
.collect();
let n = k.iter().map(|x| x * x).sum::<f64>().sqrt().max(1e-12);
for x in k.iter_mut() {
*x /= n;
}
let v: Vec<f64> = (0..D_VALUE)
.map(|j| ((i * 17 + j * 11) as f64).cos() * 0.5)
.collect();
(k, v)
})
.collect();
let target_var = {
let mut sum_var = 0.0;
for d in 0..D_VALUE {
let col: Vec<f64> = pairs.iter().map(|(_, v)| v[d]).collect();
sum_var += sample_variance(&col);
}
(sum_var / D_VALUE as f64).max(1e-12)
};
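// Fresh LogLinearAttention in GatedDeltaNet mode; the shared seed keeps the
// trained and untrained runs starting from identical parameters.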
fn build(lr: f64, seed: u64) -> LogLinearAttention {
let mut model = LogLinearAttention::new(
AttentionMode::GatedDeltaNet {
beta_scale: 1.0,
gate_mode_delta: GatedDeltaMode::Static,
},
D_MODEL,
D_KEY,
D_VALUE,
MAX_LEVELS,
default_lambda_init(MAX_LEVELS),
seed,
);
model.set_learning_rate(lr);
model
}
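// One training epoch: reset, replay every pair through `train_one`, then
// score recall as 1 - mean MSE / target variance (clamped at zero) using
// read-only queries.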
fn recall_trained(
model: &mut LogLinearAttention,
pairs: &[(Vec<f64>, Vec<f64>)],
target_var: f64,
) -> f64 {
model.reset();
for (k, v) in pairs.iter() {
let _ = model.train_one(k, v);
}
let mut total_mse = 0.0;
for (k, v) in pairs.iter() {
let pred = model.query_readonly(k);
let mse = pred
.iter()
.zip(v.iter())
.map(|(p, t)| (p - t).powi(2))
.sum::<f64>()
/ pred.len() as f64;
total_mse += mse;
}
(1.0 - total_mse / pairs.len() as f64 / target_var).max(0.0)
}
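// Control epoch with the same compute budget: reset, replay the keys through
// plain `forward` instead of `train_one`, then score recall identically.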
fn recall_untrained(
model: &mut LogLinearAttention,
pairs: &[(Vec<f64>, Vec<f64>)],
target_var: f64,
) -> f64 {
model.reset();
for (k, _v) in pairs.iter() {
let _ = model.forward(k);
}
let mut total_mse = 0.0;
for (k, v) in pairs.iter() {
let pred = model.query_readonly(k);
let mse = pred
.iter()
.zip(v.iter())
.map(|(p, t)| (p - t).powi(2))
.sum::<f64>()
/ pred.len() as f64;
total_mse += mse;
}
(1.0 - total_mse / pairs.len() as f64 / target_var).max(0.0)
}
let mut model_trained = build(LEARNING_RATE, SEED);
let mut traj_trained: Vec<f64> = Vec::with_capacity(N_EPOCHS_TRAINED);
for _ in 0..N_EPOCHS_TRAINED {
traj_trained.push(recall_trained(&mut model_trained, &pairs, target_var));
}
let mut model_untrained = build(LEARNING_RATE, SEED);
let mut traj_untrained: Vec<f64> = Vec::with_capacity(N_EPOCHS_UNTRAINED);
for _ in 0..N_EPOCHS_UNTRAINED {
traj_untrained.push(recall_untrained(&mut model_untrained, &pairs, target_var));
}
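// Compare best-epoch recall for each run. Normalized recall is clamped at 0,
// so the random baseline is 0; the trajectory lives in [0, 1], so its
// variance is capped at the theoretical maximum of 0.25.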
let trained_recall = traj_trained.iter().cloned().fold(0.0_f64, f64::max);
let untrained_recall = traj_untrained.iter().cloned().fold(0.0_f64, f64::max);
let random_recall = 0.0;
let trained_var = sample_variance(&traj_trained).min(0.25);
let bernstein_trained = empirical_bernstein_bound(N_EPOCHS_TRAINED, trained_var, 1.0, DELTA);
assert!(
trained_recall > random_recall + bernstein_trained,
"Trained LLA recall {trained_recall:.4} not significantly above random \
({random_recall:.4}) + Bernstein 95% guard ({bernstein_trained:.4}, \
n={N_EPOCHS_TRAINED}, var={trained_var:.4}). Streaming SGD failed to \
lift recall above the noise floor — check gradient direction \
(diag_log_linear_grad_check) or learning rate."
);
assert!(
trained_recall > untrained_recall,
"Trained LLA recall {trained_recall:.4} must exceed untrained-LLA \
recall {untrained_recall:.4} on streaming MQAR. Same constructor, \
same seed, same compute budget — only train_one vs forward. \
BLOCKED ON: streaming SGD on Q/K/V/λ projections is not improving \
recall over the random-init forward baseline."
);
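// Learning-curve check: the smoothed trajectory must end above where it started.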
let smoothed = moving_average(&traj_trained, SMOOTH_WINDOW);
let initial_smoothed = smoothed.first().copied().unwrap_or(0.0);
let final_smoothed = smoothed.last().copied().unwrap_or(0.0);
assert!(
final_smoothed > initial_smoothed,
"Smoothed recall trajectory does not show SGD-driven ascent: \
initial={initial_smoothed:.4}, final={final_smoothed:.4} \
(window={SMOOTH_WINDOW}). The model is not learning the task — \
either the gradient is broken or the schedule never enters a \
descent regime."
);
}
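/// Needle-in-a-haystack drift: write a needle composite, record each layer's
/// echo of the needle value, flood the memory with random distractors, then
/// query with the needle key. LLA's MSE against its own echo must be at most
/// half of GLA's (a 2x architectural advantage), unless both are already
/// saturated near zero error.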
#[test]
fn log_linear_needle_mse_vs_gla() {
const D_MODEL: usize = 16;
const N_DISTRACTORS: usize = 256;
const MAX_LEVELS: usize = 16;
const SEED: u64 = 0xFACE_FEED_DEAD_BEEF;
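// Deterministic xorshift64 PRNG mapped to roughly uniform values in [-1, 1].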
fn rand_vec(rng: &mut u64, dim: usize) -> Vec<f64> {
(0..dim)
.map(|_| {
*rng ^= *rng << 13;
*rng ^= *rng >> 7;
*rng ^= *rng << 17;
(*rng as f64) / (u64::MAX as f64) * 2.0 - 1.0
})
.collect()
}
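// Two layers with identical widths, head count, and seed; only the attention
// mode differs: LogLinear wrapping GatedDeltaNet vs plain GLA.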
let lambda_init = default_lambda_init(MAX_LEVELS);
let cfg_lla = AttentionConfig {
d_model: D_MODEL,
n_heads: 2,
d_key: D_MODEL / 2,
d_value: D_MODEL / 2,
mode: AttentionMode::LogLinear {
inner: Box::new(AttentionMode::GatedDeltaNet {
beta_scale: 1.0,
gate_mode_delta: GatedDeltaMode::Static,
}),
max_levels: MAX_LEVELS,
lambda_init,
},
seed: SEED,
};
let mut layer_lla = MultiHeadAttention::new(cfg_lla);
let cfg_gla = AttentionConfig {
d_model: D_MODEL,
n_heads: 2,
d_key: D_MODEL / 2,
d_value: D_MODEL / 2,
mode: AttentionMode::GLA,
seed: SEED,
};
let mut layer_gla = MultiHeadAttention::new(cfg_gla);
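// Write the needle as a 0.5 * (key + value) composite, then record each
// layer's echo of the needle value as the retrieval reference.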
let mut rng = SEED.wrapping_add(0x000D_D0DD_00DD_00DD);
let needle_key = rand_vec(&mut rng, D_MODEL);
let needle_value = rand_vec(&mut rng, D_MODEL);
let composite: Vec<f64> = needle_key
.iter()
.zip(needle_value.iter())
.map(|(k, v)| 0.5 * (k + v))
.collect();
let _ = layer_lla.forward(&composite);
let _ = layer_gla.forward(&composite);
let echo_lla = layer_lla.forward(&needle_value);
let echo_gla = layer_gla.forward(&needle_value);
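// Flood both layers with an identical stream of random distractor composites.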
for _ in 0..N_DISTRACTORS {
let dk = rand_vec(&mut rng, D_MODEL);
let dv = rand_vec(&mut rng, D_MODEL);
let dt: Vec<f64> = dk
.iter()
.zip(dv.iter())
.map(|(a, b)| 0.5 * (a + b))
.collect();
let _ = layer_lla.forward(&dt);
let _ = layer_gla.forward(&dt);
}
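// After the distractor flood, query with the needle key and measure MSE
// against the pre-distractor echo.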
let q_lla = layer_lla.forward(&needle_key);
let q_gla = layer_gla.forward(&needle_key);
let mse_lla = q_lla
.iter()
.zip(echo_lla.iter())
.map(|(a, b)| (a - b).powi(2))
.sum::<f64>()
/ D_MODEL as f64;
let mse_gla = q_gla
.iter()
.zip(echo_gla.iter())
.map(|(a, b)| (a - b).powi(2))
.sum::<f64>()
/ D_MODEL as f64;
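// Pass if both layers are essentially error-free, or if LLA's drift is at
// most half of GLA's (the 2x advantage threshold asserted below).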
let both_saturated = mse_lla < 1e-3 && mse_gla < 1e-3;
let lla_at_least_twice_better = mse_lla <= 0.5 * mse_gla;
assert!(
both_saturated || lla_at_least_twice_better,
"BLOCKED ON: LLA needle-MSE does NOT satisfy the ≤0.5 × GLA claim. \
Got mse_lla={mse_lla:.5}, mse_gla={mse_gla:.5}, ratio={:.3}. \
Fails the 2× architectural advantage threshold. \
Action: dispatch LLA architecture review — check Fenwick-level \
capacity at n_distractors={N_DISTRACTORS}, max_levels={MAX_LEVELS}.",
mse_lla / mse_gla.max(1e-12)
);
let cos = cosine_sim(&q_lla, &echo_lla);
assert!(cos.is_finite(), "needle cosine similarity must be finite");
}