Struct MultiHeadSelfAttention

Source

pub struct MultiHeadSelfAttention<B: Backend> { /* private fields */ }

Expand description

Scaled dot-product multi-head self-attention with optional chunked computation.

When chunk_size > 0, the query sequence is processed in windows of chunk_size rows (chunk_size = 0 selects full, unchunked attention). Chunking keeps the forward-pass peak attention memory at O(B · H · chunk_size · N) instead of O(B · H · N²), and keeps each individual WGPU dispatch small enough to avoid OS watchdog (TDR) timeouts.

§⚠ Training memory — chunking reduces dispatch size but NOT total tape

Burn’s forward pass builds an autodiff tape for every transformer layer before loss.backward() runs. At the forward→backward boundary, the attention chunk tensors of all layers (depth of them) are held in GPU memory simultaneously:

peak = depth × 2 × ceil(N/chunk) × B × H × chunk × N × 4 bytes
     = 12 × 2 × 39 × B × 12 × 64 × 2448 × 4   (ViT-B defaults)
     ≈ 6.56 GB × B

Chunking (a small chunk_size) keeps each individual GPU dispatch small (preventing OS watchdog / TDR timeouts), but the cumulative tape size is the same as with full attention. For a fixed model and batch size, the only way to reduce training memory is gradient checkpointing (recomputing attention during the backward pass instead of storing it), which is not yet implemented in this codebase.

Safe configurations (24 GB GPU, ViT-B):

batch_size = 2 → all-layers peak ≈ 13 GB ✓
batch_size = 4 → all-layers peak ≈ 26 GB ✗ OOM

The crate::training::learner::train function guards against unsafe configurations using --vram-gb to derive the correct limit.

§Forward memory comparison (N = 2 448, H = 12, B = 8, fp32)

mode	peak fwd attn tensor	size
full (chunk=0)	(8, 12, 2448, 2448)	~2.3 GB
chunk=256	(8, 12, 256, 2448)	~240 MB
chunk=128	(8, 12, 128, 2448)	~120 MB
chunk=64	(8, 12, 64, 2448)	~60 MB

Struct MultiHeadSelfAttention Copy item path

§⚠ Training memory — chunking reduces dispatch size but NOT total tape

§Forward memory comparison (N = 2 448, H = 12, B = 8, fp32)

Implementations§

impl<B: Backend> MultiHeadSelfAttention<B>

pub fn new( d_model: usize, num_heads: usize, dropout: f64, chunk_size: usize, device: &B::Device, ) -> Self

pub fn forward(&self, x: Tensor<B, 3>) -> Tensor<B, 3>

Trait Implementations§

impl<B> AutodiffModule<B> for MultiHeadSelfAttention<B> where B: AutodiffBackend + Backend, <B as AutodiffBackend>::InnerBackend: Backend,

type InnerModule = MultiHeadSelfAttention<<B as AutodiffBackend>::InnerBackend>

fn valid(&self) -> Self::InnerModule

impl<B: Backend> Clone for MultiHeadSelfAttention<B>

fn clone(&self) -> Self

fn clone_from(&mut self, source: &Self)

impl<B: Debug + Backend> Debug for MultiHeadSelfAttention<B>

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl<B: Backend> Display for MultiHeadSelfAttention<B>

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl<B: Backend> Module<B> for MultiHeadSelfAttention<B>

type Record = MultiHeadSelfAttentionRecord<B>

fn load_record(self, record: Self::Record) -> Self

fn into_record(self) -> Self::Record

fn num_params(&self) -> usize

fn visit<Visitor: ModuleVisitor<B>>(&self, visitor: &mut Visitor)

fn map<Mapper: ModuleMapper<B>>(self, mapper: &mut Mapper) -> Self

fn collect_devices(&self, devices: Devices<B>) -> Devices<B>

fn to_device(self, device: &B::Device) -> Self

fn fork(self, device: &B::Device) -> Self

fn devices(&self) -> Vec<<B as Backend>::Device>

fn no_grad(self) -> Self

fn save_file<FR, PB>( self, file_path: PB, recorder: &FR, ) -> Result<(), RecorderError> where FR: FileRecorder<B>, PB: Into<PathBuf>,

fn load_file<FR, PB>( self, file_path: PB, recorder: &FR, device: &<B as Backend>::Device, ) -> Result<Self, RecorderError> where FR: FileRecorder<B>, PB: Into<PathBuf>,

fn quantize_weights<C>(self, quantizer: &mut Quantizer<C>) -> Self where C: Calibration,

impl<B: Backend> ModuleDisplay for MultiHeadSelfAttention<B>

fn format(&self, passed_settings: DisplaySettings) -> String

fn custom_settings(&self) -> Option<DisplaySettings>

fn custom_content(&self, _content: Content) -> Option<Content>

impl<B: Backend> ModuleDisplayDefault for MultiHeadSelfAttention<B>

fn content(&self, content: Content) -> Option<Content>

fn num_params(&self) -> usize

Auto Trait Implementations§

impl<B> !Freeze for MultiHeadSelfAttention<B>

impl<B> !RefUnwindSafe for MultiHeadSelfAttention<B>

impl<B> Send for MultiHeadSelfAttention<B>

impl<B> !Sync for MultiHeadSelfAttention<B>

impl<B> Unpin for MultiHeadSelfAttention<B> where <B as Backend>::FloatTensorPrimitive<2>: Unpin, <B as Backend>::QuantizedTensorPrimitive<2>: Unpin, <B as Backend>::Device: Unpin, <B as Backend>::FloatTensorPrimitive<1>: Unpin, <B as Backend>::QuantizedTensorPrimitive<1>: Unpin,

impl<B> UnwindSafe for MultiHeadSelfAttention<B> where <B as Backend>::FloatTensorPrimitive<2>: UnwindSafe, <B as Backend>::QuantizedTensorPrimitive<2>: UnwindSafe, <B as Backend>::FloatTensorPrimitive<1>: UnwindSafe, <B as Backend>::QuantizedTensorPrimitive<1>: UnwindSafe,

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T> PolicyExt for T where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P> where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P> where T: Policy<B, E>, P: Policy<B, E>,

impl<T> Same for T

type Output = T

impl<T> ToOwned for T where T: Clone,

type Owned = T

Struct MultiHeadSelfAttention

impl<B> AutodiffModule<B> for MultiHeadSelfAttention<B>
where B: AutodiffBackend + Backend, <B as AutodiffBackend>::InnerBackend: Backend,

fn save_file<FR, PB>( self, file_path: PB, recorder: &FR, ) -> Result<(), RecorderError>
where FR: FileRecorder<B>, PB: Into<PathBuf>,

fn load_file<FR, PB>( self, file_path: PB, recorder: &FR, device: &<B as Backend>::Device, ) -> Result<Self, RecorderError>
where FR: FileRecorder<B>, PB: Into<PathBuf>,

fn quantize_weights<C>(self, quantizer: &mut Quantizer<C>) -> Self
where C: Calibration,

impl<B> Unpin for MultiHeadSelfAttention<B>
where <B as Backend>::FloatTensorPrimitive<2>: Unpin, <B as Backend>::QuantizedTensorPrimitive<2>: Unpin, <B as Backend>::Device: Unpin, <B as Backend>::FloatTensorPrimitive<1>: Unpin, <B as Backend>::QuantizedTensorPrimitive<1>: Unpin,

impl<B> UnwindSafe for MultiHeadSelfAttention<B>
where <B as Backend>::FloatTensorPrimitive<2>: UnwindSafe, <B as Backend>::QuantizedTensorPrimitive<2>: UnwindSafe, <B as Backend>::FloatTensorPrimitive<1>: UnwindSafe, <B as Backend>::QuantizedTensorPrimitive<1>: UnwindSafe,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> PolicyExt for T
where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

impl<T> ToOwned for T
where T: Clone,

impl<T> ToString for T
where T: Display + ?Sized,

impl<T> ToStringFallible for T
where T: Display,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,