Struct TrainingConfig

Source

pub struct TrainingConfig {
    pub model_size: ModelSize,
    pub total_steps: usize,
    pub batch_size: usize,
    pub lr: f64,
    pub weight_decay: f64,
    pub beta2: f64,
    pub beta1: f64,
    pub epsilon: f64,
    pub grad_clip_norm: f64,
    pub warmup_fraction: f64,
    pub cooldown_fraction: f64,
    pub lr_schedule: LrScheduleType,
    pub checkpoint_every: usize,
    pub log_every: usize,
    pub seed: u64,
    pub caption_key: CaptionKey,
    pub tokenizer_path: String,
    pub artifact_dir: String,
    pub data_dir: String,
    pub num_workers: usize,
    pub vram_gb: Option<f64>,
    pub skip_vram_check: bool,
    pub show_summary: bool,
}

Expand description

Training hyperparameters.

Fields§

§model_size: ModelSize

Model size preset (tiny / small / base).

Overrides SensorLMConfig when passed through the CLI. Building a config from a preset is the recommended way to avoid mismatched d_model / embed_dim values between the two towers.

§total_steps: usize

Total number of gradient update steps.

§batch_size: usize

Mini-batch size (default: 8).

§lr: f64

Peak learning rate (default: 5 × 10⁻⁴).

§weight_decay: f64

AdamW weight decay (default: 1 × 10⁻⁴).

§beta2: f64

Adam β₂ (default: 0.999; the reference uses scale_by_adam with b2 = 0.999).

§beta1: f64

Adam β₁.

§epsilon: f64

Adam ε.

§grad_clip_norm: f64

Gradient clip norm (default: 1.0).

§warmup_fraction: f64

Fraction of total steps used for linear warm-up (default: 0.2).

§cooldown_fraction: f64

Fraction of total steps used for cool-down (default: 0.2).

§lr_schedule: LrScheduleType

LR schedule type.

§checkpoint_every: usize

Save a checkpoint every N steps.

§log_every: usize

Log metrics every N steps.

§seed: u64

Random seed.

§caption_key: CaptionKey

Caption type key to use during this training run.

§tokenizer_path: String

Path to the SentencePiece tokenizer model file.

§artifact_dir: String

Directory to write checkpoints / logs.

§data_dir: String

Directory containing the dataset (Parquet or raw files).

§num_workers: usize

Number of DataLoader worker threads for CPU-side data preparation.

Must be ≥ 1 — Burn’s PartialDataset::split divides the dataset length by num_workers, so 0 causes a divide-by-zero panic.

The WGPU backend (including Metal on macOS) is internally thread-safe: worker threads can call Tensor::from_floats(…, &device) safely. 2 workers is a reasonable default; increase on machines with many CPU cores and fast NVMe storage. Use 1 if you observe data-loading becoming the training bottleneck (rare with synthetic data).

§vram_gb: Option<f64>

Available GPU VRAM in gigabytes.

When set, the pre-flight guard derives the attention-tensor budget as vram_gb / 3 and auto-caps batch_size to the largest value that fits, so you never have to tune --batch-size manually.

Memory split used (all figures are estimates):

┌─────────────────────────────────────────────────────┐
│  1/3 → attention score/weight tensors (one layer)  │
│  1/3 → model weights + gradients + Adam states     │
│  1/3 → non-attention activations + OS/driver slack │
└─────────────────────────────────────────────────────┘

Examples for ViT-B (base), depth=12, H=12, chunk=64, N=2448:

The peak memory is depth × per-layer because Burn’s forward pass builds autodiff tape for all transformer layers before backward() starts. 70% of VRAM is budgeted for this tape; the rest covers model weights + gradients + Adam states + other activations.

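| VRAM  | attn budget (×0.7) | max batch | all-layers peak |
|-------|--------------------|-----------|-----------------|
| 8 GB  | 5.6 GB             | 1         | 6.6 GB          |
| 16 GB | 11.2 GB            | 1         | 6.6 GB          |
| 24 GB | 16.8 GB            | 2         | 13.1 GB         |
| 32 GB | 22.4 GB            | 3         | 19.7 GB         |
| 48 GB | 33.6 GB            | 5         | 32.8 GB         |
| 80 GB | 56.0 GB            | 8         | 52.4 GB         |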
§skip_vram_check: bool

Skip the pre-flight VRAM safety check and proceed even if the estimated attention memory exceeds the computed limit.

Use this only when you are certain your GPU has enough free VRAM. You accept full responsibility for OOM errors or GPU driver crashes.

§show_summary: bool

Print Burn’s ═══ Learner Summary ═══ table after training.

Disabled by default to keep the terminal output clean. Pass --summary on the CLI to enable.

TrainingConfig

Struct TrainingConfig Copy item path

Fields§

Trait Implementations§

impl Clone for TrainingConfig

fn clone(&self) -> TrainingConfig

fn clone_from(&mut self, source: &Self)

impl Debug for TrainingConfig

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl Default for TrainingConfig

fn default() -> Self

impl<'de> Deserialize<'de> for TrainingConfig

fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error> where __D: Deserializer<'de>,

impl Serialize for TrainingConfig

fn serialize<__S>(&self, __serializer: __S) -> Result<__S::Ok, __S::Error> where __S: Serializer,

Auto Trait Implementations§

impl Freeze for TrainingConfig

impl RefUnwindSafe for TrainingConfig

impl Send for TrainingConfig

impl Sync for TrainingConfig

impl Unpin for TrainingConfig

impl UnsafeUnpin for TrainingConfig

impl UnwindSafe for TrainingConfig

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T> PolicyExt for T where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P> where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P> where T: Policy<B, E>, P: Policy<B, E>,

impl<T> Same for T

type Output = T

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

impl<T> DeserializeOwned for T where T: for<'de> Deserialize<'de>,

Struct TrainingConfig

fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>,

fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> PolicyExt for T
where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

impl<T> DeserializeOwned for T
where T: for<'de> Deserialize<'de>,