pub struct GemmaConfig {
pub arch: GemmaArch,
pub vocab_size: usize,
pub hidden_size: usize,
pub intermediate_size: usize,
pub num_hidden_layers: usize,
pub num_attention_heads: usize,
pub num_key_value_heads: usize,
pub max_position_embeddings: usize,
pub rms_norm_eps: f64,
pub rope_theta: f64,
pub tie_word_embeddings: bool,
pub attention_bias: bool,
pub head_dim: Option<usize>,
pub attn_logit_softcapping: Option<f32>,
pub final_logit_softcapping: Option<f32>,
pub sliding_window: Option<usize>,
pub query_pre_attn_scalar: Option<f32>,
pub effective_num_layers: Option<usize>,
pub num_experts: usize,
pub num_experts_used: usize,
pub expert_ffn_size: usize,
pub expert_weights_scale: f32,
pub layer_types: Vec<GemmaLayerType>,
pub rope_parameters: GemmaRopeMap,
pub global_head_dim: Option<usize>,
pub num_global_key_value_heads: Option<usize>,
pub attention_k_eq_v: bool,
pub use_bidirectional_attention: Option<String>,
}
Fields§
§arch: GemmaArch
§vocab_size: usize
§hidden_size: usize
§intermediate_size: usize
§num_hidden_layers: usize
§num_attention_heads: usize
§num_key_value_heads: usize
§max_position_embeddings: usize
§rms_norm_eps: f64
§rope_theta: f64
§tie_word_embeddings: bool
§attention_bias: bool
§head_dim: Option<usize>
§attn_logit_softcapping: Option<f32>
§final_logit_softcapping: Option<f32>
§sliding_window: Option<usize>
§query_pre_attn_scalar: Option<f32>
§effective_num_layers: Option<usize>
§num_experts: usize
§num_experts_used: usize
§expert_ffn_size: usize
§expert_weights_scale: f32
§layer_types: Vec<GemmaLayerType>
Per-layer attention kind. Empty for Gemma <= 3, in which case the
strided pattern derived from arch.sliding_window_stride is used instead.
§rope_parameters: GemmaRopeMap
Per-attention-kind rope settings. Empty for Gemma <= 3.
§global_head_dim: Option<usize>
Head dim for full-attention (global) layers. None ⇒ reuse
the base head_dim. Gemma 4 12B sets this to 512 while the
sliding head_dim stays at 256.
§num_global_key_value_heads: Option<usize>
Num KV heads for full-attention layers. None ⇒ reuse the
base num_key_value_heads. Gemma 4 12B sets this to 1.
§attention_k_eq_v: bool
When true (Gemma 4 12B), the K projection is reused as V at
load time: the weights ship only .k_proj, and .v_proj becomes
an alias of it.
§use_bidirectional_attention: Option<String>
When "vision", media placeholder spans use bidirectional
attention on sliding layers (Gemma 4 unified).
Implementations§
Source§impl GemmaConfig
impl GemmaConfig
pub fn from_file(path: &Path) -> Result<GemmaConfig, Error>
pub fn from_gguf(raw: &GgufFile) -> Result<GemmaConfig, Error>
pub fn head_dim(&self) -> usize
pub fn kv_group_size(&self) -> usize
pub fn q_proj_dim(&self) -> usize
pub fn kv_proj_dim(&self) -> usize
pub fn layer_style(&self) -> GemmaLayerStyle
pub fn active_num_layers(&self) -> usize
pub fn is_moe(&self) -> bool
Sourcepub fn use_bidirectional_vision(&self) -> bool
pub fn use_bidirectional_vision(&self) -> bool
Gemma 4 unified: bidirectional attention inside vision/audio spans.
pub fn expert_ffn_dim(&self) -> usize
pub fn attn_score_scale(&self) -> Option<f32>
Sourcepub fn layer_attn_options(
&self,
layer: usize,
) -> (MaskKind, Option<f32>, Option<f32>)
pub fn layer_attn_options( &self, layer: usize, ) -> (MaskKind, Option<f32>, Option<f32>)
Per-layer attention options driving the prefill self-attn block:
(mask kind, softmax score scale, attention logit soft-cap).
The mask varies across Gemma variants:
- Gemma 1 / no sliding window → all-causal.
- Gemma 2 → alternating sliding-window via gemma2_layer_mask.
- Gemma 3 / 4 → strided pattern via gemma_strided_layer_mask
  (stride-6: every 6th layer is full causal, others are sliding-window).
Sourcepub fn is_full_attention_layer(&self, layer: usize) -> bool
pub fn is_full_attention_layer(&self, layer: usize) -> bool
Whether layer i is a full-attention (global) layer rather
than a sliding-window one. Falls back to the strided pattern
(every stride-th layer is global) when layer_types is
unset.
Sourcepub fn layer_head_dim(&self, layer: usize) -> usize
pub fn layer_head_dim(&self, layer: usize) -> usize
Per-layer head_dim. Sliding layers always use the base
head_dim; full-attention layers use global_head_dim when
set (Gemma 4 12B: 512 vs base 256).
Sourcepub fn layer_num_kv_heads(&self, layer: usize) -> usize
pub fn layer_num_kv_heads(&self, layer: usize) -> usize
Per-layer KV head count. Sliding layers use
num_key_value_heads; full-attention layers use
num_global_key_value_heads when set (Gemma 4 12B: 1 vs 8).
Sourcepub fn layer_n_rot(&self, layer: usize) -> usize
pub fn layer_n_rot(&self, layer: usize) -> usize
Number of leading per-head dimensions that get RoPE-rotated
in layer i. Returns layer_head_dim for “default” RoPE,
or floor(partial_rotary_factor * head_dim) for p-RoPE.
Sourcepub fn layer_rope_theta(&self, layer: usize) -> f64
pub fn layer_rope_theta(&self, layer: usize) -> f64
RoPE base frequency for layer i. Falls back to the
top-level rope_theta when the unified map omits the entry.
Trait Implementations§
Source§impl Clone for GemmaConfig
impl Clone for GemmaConfig
Source§fn clone(&self) -> GemmaConfig
fn clone(&self) -> GemmaConfig
1.0.0 (const: unstable) · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl Debug for GemmaConfig
impl Debug for GemmaConfig
Source§impl<'de> Deserialize<'de> for GemmaConfig
impl<'de> Deserialize<'de> for GemmaConfig
Source§fn deserialize<__D>(
__deserializer: __D,
) -> Result<GemmaConfig, <__D as Deserializer<'de>>::Error>where
__D: Deserializer<'de>,
fn deserialize<__D>(
__deserializer: __D,
) -> Result<GemmaConfig, <__D as Deserializer<'de>>::Error>where
__D: Deserializer<'de>,
Auto Trait Implementations§
impl Freeze for GemmaConfig
impl RefUnwindSafe for GemmaConfig
impl Send for GemmaConfig
impl Sync for GemmaConfig
impl Unpin for GemmaConfig
impl UnsafeUnpin for GemmaConfig
impl UnwindSafe for GemmaConfig
Blanket Implementations§
Source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
impl<ST, DT> CastableFrom<ST, Initialized, Initialized> for DT
impl<ST, DT> CastableFrom<ST, Uninit, Uninit> for DT
Source§impl<T> CloneToUninit for Twhere
T: Clone,
impl<T> CloneToUninit for Twhere
T: Clone,
impl<T> DeserializeOwned for Twhere
T: for<'de> Deserialize<'de>,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more