# Crate attnres

## attnres

First Rust implementation of Attention Residuals from the MoonshotAI/Kimi paper, built on the Burn deep learning framework.

Attention Residuals replace standard fixed-weight residual connections in Transformers with learned softmax attention over depth, enabling selective information routing across layers.

## Quick Start

use attnres::{AttnResConfig, AttnResTransformer};
use burn::prelude::*;
use burn::backend::NdArray;

type B = NdArray;

let device = Default::default();
let config = AttnResConfig::new(128, 8, 2)
    .with_num_heads(4)
    .with_vocab_size(1000);

let model: AttnResTransformer<B> = config.init_model(&device);
let input_ids = Tensor::<B, 2, Int>::zeros([1, 16], &device);
let logits = model.forward(input_ids, None);
assert_eq!(logits.dims(), [1, 16, 1000]);

## Re-exports

pub use attention::MultiHeadAttention;
pub use attention::MultiHeadAttentionConfig;
pub use attn_res_op::AttnResOp;
pub use block_state::BlockState;
pub use config::AttnResConfig;
pub use feed_forward::FeedForward;
pub use feed_forward::FeedForwardConfig;
pub use layer::AttnResLayer;
pub use model::AttnResTransformer;
pub use rms_norm::RmsNorm;
pub use rms_norm::RmsNormConfig;
pub use serialization::SerializationError;
pub use utils::causal_mask;

## Modules

attention
attn_res_op
block_state
config
feed_forward
layer
model
rms_norm
serialization
two_phase
utils