use serde::Deserialize;
/// Deserializable configuration for the tokenizer V2 decoder.
///
/// Every field carries a serde default, so a key that is missing from the
/// deserialized input is filled in with the value noted on that field.
/// Field meanings marked "presumably" are inferred from the names only and
/// should be confirmed against the code that consumes this config.
#[derive(Debug, Clone, Deserialize)]
pub struct TokenizerV2DecoderConfig {
/// Defaults to 2048. Presumably the number of entries per codebook — TODO confirm.
#[serde(default = "default_codebook_size")]
pub codebook_size: usize,
/// Defaults to 1024. Presumably the transformer hidden width — TODO confirm.
#[serde(default = "default_hidden_size")]
pub hidden_size: usize,
/// Defaults to 1024.
#[serde(default = "default_latent_dim")]
pub latent_dim: usize,
/// Defaults to 256.
#[serde(default = "default_codebook_dim")]
pub codebook_dim: usize,
/// Defaults to 8000.
#[serde(default = "default_max_position_embeddings")]
pub max_position_embeddings: usize,
/// Defaults to 10000.0. Presumably the rotary position embedding base — TODO confirm.
#[serde(default = "default_rope_theta")]
pub rope_theta: f64,
/// Defaults to 16.
#[serde(default = "default_num_attention_heads")]
pub num_attention_heads: usize,
/// Defaults to 16.
#[serde(default = "default_num_key_value_heads")]
pub num_key_value_heads: usize,
/// Optional; `None` when the key is absent.
/// NOTE(review): how a missing value is resolved is not visible in this file.
#[serde(default)]
pub head_dim: Option<usize>,
/// Defaults to `false`.
#[serde(default)]
pub attention_bias: bool,
/// Defaults to 72.
#[serde(default = "default_sliding_window")]
pub sliding_window: usize,
/// Defaults to 3072.
#[serde(default = "default_intermediate_size")]
pub intermediate_size: usize,
/// Defaults to `"silu"`. Stored as a free-form string; accepted values are
/// decided by the consumer, which is not visible here.
#[serde(default = "default_hidden_act")]
pub hidden_act: String,
/// Defaults to 0.01.
#[serde(default = "default_layer_scale_initial_scale")]
pub layer_scale_initial_scale: f64,
/// Defaults to 1e-5.
#[serde(default = "default_rms_norm_eps")]
pub rms_norm_eps: f64,
/// Defaults to 8.
#[serde(default = "default_num_hidden_layers")]
pub num_hidden_layers: usize,
/// Defaults to 16.
#[serde(default = "default_num_quantizers")]
pub num_quantizers: usize,
/// Defaults to `[8, 5, 4, 3]`. Its entries are multiplied together by
/// `total_upsample`.
#[serde(default = "default_upsample_rates")]
pub upsample_rates: Vec<usize>,
/// Defaults to `[2, 2]`. Its entries are multiplied together by
/// `total_upsample`.
#[serde(default = "default_upsampling_ratios")]
pub upsampling_ratios: Vec<usize>,
/// Defaults to 1536.
#[serde(default = "default_decoder_dim")]
pub decoder_dim: usize,
/// Defaults to 0.0.
#[serde(default)]
pub attention_dropout: f64,
}
/// Serde fallback for `codebook_size` when the key is absent.
fn default_codebook_size() -> usize {
    const CODEBOOK_SIZE: usize = 2048;
    CODEBOOK_SIZE
}
/// Serde fallback for `hidden_size` when the key is absent.
fn default_hidden_size() -> usize {
    const HIDDEN_SIZE: usize = 1024;
    HIDDEN_SIZE
}
/// Serde fallback for `latent_dim` when the key is absent.
fn default_latent_dim() -> usize {
    const LATENT_DIM: usize = 1024;
    LATENT_DIM
}
/// Serde fallback for `codebook_dim` when the key is absent.
fn default_codebook_dim() -> usize {
    const CODEBOOK_DIM: usize = 256;
    CODEBOOK_DIM
}
/// Serde fallback for `max_position_embeddings` when the key is absent.
fn default_max_position_embeddings() -> usize {
    const MAX_POSITION_EMBEDDINGS: usize = 8000;
    MAX_POSITION_EMBEDDINGS
}
/// Serde fallback for `rope_theta` when the key is absent.
fn default_rope_theta() -> f64 {
    const ROPE_THETA: f64 = 10000.0;
    ROPE_THETA
}
/// Serde fallback for `num_attention_heads` when the key is absent.
fn default_num_attention_heads() -> usize {
    const NUM_ATTENTION_HEADS: usize = 16;
    NUM_ATTENTION_HEADS
}
/// Serde fallback for `num_key_value_heads` when the key is absent.
fn default_num_key_value_heads() -> usize {
    const NUM_KEY_VALUE_HEADS: usize = 16;
    NUM_KEY_VALUE_HEADS
}
/// Serde fallback for `sliding_window` when the key is absent.
fn default_sliding_window() -> usize {
    const SLIDING_WINDOW: usize = 72;
    SLIDING_WINDOW
}
/// Serde fallback for `intermediate_size` when the key is absent.
fn default_intermediate_size() -> usize {
    const INTERMEDIATE_SIZE: usize = 3072;
    INTERMEDIATE_SIZE
}
/// Serde fallback for `hidden_act` when the key is absent.
fn default_hidden_act() -> String {
    String::from("silu")
}
/// Serde fallback for `layer_scale_initial_scale` when the key is absent.
fn default_layer_scale_initial_scale() -> f64 {
    const LAYER_SCALE_INITIAL_SCALE: f64 = 0.01;
    LAYER_SCALE_INITIAL_SCALE
}
/// Serde fallback for `rms_norm_eps` when the key is absent.
fn default_rms_norm_eps() -> f64 {
    const RMS_NORM_EPS: f64 = 1e-5;
    RMS_NORM_EPS
}
/// Serde fallback for `num_hidden_layers` when the key is absent.
fn default_num_hidden_layers() -> usize {
    const NUM_HIDDEN_LAYERS: usize = 8;
    NUM_HIDDEN_LAYERS
}
/// Serde fallback for `num_quantizers` when the key is absent.
fn default_num_quantizers() -> usize {
    const NUM_QUANTIZERS: usize = 16;
    NUM_QUANTIZERS
}
/// Serde fallback for `upsample_rates` when the key is absent.
fn default_upsample_rates() -> Vec<usize> {
    const UPSAMPLE_RATES: [usize; 4] = [8, 5, 4, 3];
    UPSAMPLE_RATES.to_vec()
}
/// Serde fallback for `upsampling_ratios` when the key is absent.
fn default_upsampling_ratios() -> Vec<usize> {
    const UPSAMPLING_RATIOS: [usize; 2] = [2, 2];
    UPSAMPLING_RATIOS.to_vec()
}
/// Serde fallback for `decoder_dim` when the key is absent.
fn default_decoder_dim() -> usize {
    const DECODER_DIM: usize = 1536;
    DECODER_DIM
}
impl Default for TokenizerV2DecoderConfig {
fn default() -> Self {
Self {
codebook_size: default_codebook_size(),
hidden_size: default_hidden_size(),
latent_dim: default_latent_dim(),
codebook_dim: default_codebook_dim(),
max_position_embeddings: default_max_position_embeddings(),
rope_theta: default_rope_theta(),
num_attention_heads: default_num_attention_heads(),
num_key_value_heads: default_num_key_value_heads(),
head_dim: None,
attention_bias: false,
sliding_window: default_sliding_window(),
intermediate_size: default_intermediate_size(),
hidden_act: default_hidden_act(),
layer_scale_initial_scale: default_layer_scale_initial_scale(),
rms_norm_eps: default_rms_norm_eps(),
num_hidden_layers: default_num_hidden_layers(),
num_quantizers: default_num_quantizers(),
upsample_rates: default_upsample_rates(),
upsampling_ratios: default_upsampling_ratios(),
decoder_dim: default_decoder_dim(),
attention_dropout: 0.0,
}
}
}
impl TokenizerV2DecoderConfig {
    /// Returns the combined upsampling factor: the product of every entry in
    /// `upsample_rates` multiplied by the product of every entry in
    /// `upsampling_ratios`.
    ///
    /// An empty list contributes a factor of 1.
    pub fn total_upsample(&self) -> usize {
        let rate_factor: usize = self.upsample_rates.iter().copied().product();
        let ratio_factor: usize = self.upsampling_ratios.iter().copied().product();
        rate_factor * ratio_factor
    }
}