§attnres
First Rust implementation of Attention Residuals from the MoonshotAI/Kimi paper, built on the burn deep learning framework.
Attention Residuals replace standard fixed-weight residual connections in Transformers with learned softmax attention over depth, enabling selective information routing across layers.
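As a rough illustration of the idea (a minimal sketch in plain Rust, not the crate's API; the names `softmax`, `attention_residual`, `history`, and `scores` are hypothetical), the snippet below mixes the outputs of all earlier layers with softmax weights instead of adding back only the previous layer's output:

```rust
// Hypothetical sketch of attention over depth for a single token:
// `history[i]` holds the output of layer i, and `scores[i]` is the
// current layer's learned (pre-softmax) affinity for layer i.
fn softmax(scores: &[f32]) -> Vec<f32> {
    let max = scores.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = scores.iter().map(|s| (s - max).exp()).collect();
    let sum: f32 = exps.iter().sum();
    exps.iter().map(|e| e / sum).collect()
}

// Weighted mix of all earlier layer outputs; this replaces the fixed
// skip term of a standard residual connection with a learned,
// normalized combination over depth.
fn attention_residual(history: &[Vec<f32>], scores: &[f32]) -> Vec<f32> {
    let weights = softmax(scores);
    let mut mixed = vec![0.0_f32; history[0].len()];
    for (layer_out, w) in history.iter().zip(&weights) {
        for (m, x) in mixed.iter_mut().zip(layer_out) {
            *m += w * x;
        }
    }
    mixed
}

fn main() {
    let history = vec![vec![1.0, 0.0], vec![0.0, 1.0]]; // outputs of layers 0 and 1
    let scores = [0.0, 2.0]; // learned scores favouring layer 1
    println!("{:?}", attention_residual(&history, &scores));
}
```

In a standard residual connection the previous layer's output is effectively added back with a fixed weight of 1 and all earlier layers get 0; learning the scores lets each layer choose which earlier representations to read from.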
§Quick Start
use attnres::{AttnResConfig, AttnResTransformer};
use burn::prelude::*;
use burn::backend::NdArray;

// CPU (ndarray) backend for the example.
type B = NdArray;
let device = Default::default();

let config = AttnResConfig::new(128, 8, 2)
    .with_num_heads(4)
    .with_vocab_size(1000);
let model: AttnResTransformer<B> = config.init_model(&device);

// A batch of one sequence of 16 token ids.
let input_ids = Tensor::<B, 2, Int>::zeros([1, 16], &device);
let logits = model.forward(input_ids, None);

// Logits over the 1000-token vocabulary at each of the 16 positions.
assert_eq!(logits.dims(), [1, 16, 1000]);

§Re-exports
pub use attention::MultiHeadAttention;
pub use attention::MultiHeadAttentionConfig;
pub use attn_res_op::AttnResOp;
pub use block_state::BlockState;
pub use config::AttnResConfig;
pub use feed_forward::FeedForward;
pub use feed_forward::FeedForwardConfig;
pub use layer::AttnResLayer;
pub use model::AttnResTransformer;
pub use rms_norm::RmsNorm;
pub use rms_norm::RmsNormConfig;
pub use serialization::SerializationError;
pub use utils::causal_mask;