use serde::{Deserialize, Serialize};
/// Configuration for an audio model, serializable and deserializable via serde.
///
/// `encoder_type`, `hidden_size`, `num_layers` and `num_heads` have no serde
/// default and must be present in the input. Every other field falls back to
/// a default value when it is omitted.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioConfig {
/// Name of the encoder variant. Free-form string; the accepted values are
/// not defined in this file.
pub encoder_type: String,
/// Hidden size of the model. Also the base of the [`AudioConfig::ffn_dim`]
/// fallback (`hidden_size * 4`).
pub hidden_size: usize,
/// Number of layers. Also the fallback for
/// [`AudioConfig::decoder_layer_count`] when `decoder_layers` is unset.
pub num_layers: usize,
/// Number of attention heads (presumably per layer; confirm against the
/// model code).
pub num_heads: usize,
/// Number of mel bins; defaults to 128 when omitted.
#[serde(default = "default_num_mel_bins")]
pub num_mel_bins: usize,
/// Maximum audio length; defaults to 3000 when omitted.
/// NOTE(review): the unit (frames vs. samples) is not stated here; confirm.
#[serde(default = "default_max_audio_len")]
pub max_audio_len: usize,
/// Projector variant; defaults to `"linear"` when omitted.
#[serde(default = "default_audio_projector_type")]
pub projector_type: String,
/// Vocabulary size; defaults to 51865 when omitted.
#[serde(default = "default_vocab_size")]
pub vocab_size: usize,
/// Optional explicit decoder layer count; `None` when omitted, in which case
/// [`AudioConfig::decoder_layer_count`] uses `num_layers`.
#[serde(default)]
pub decoder_layers: Option<usize>,
/// Maximum number of target positions; defaults to 448 when omitted.
#[serde(default = "default_max_target_positions")]
pub max_target_positions: usize,
/// Optional explicit feed-forward size; `None` when omitted, in which case
/// [`AudioConfig::ffn_dim`] uses `hidden_size * 4`.
#[serde(default)]
pub intermediate_size: Option<usize>,
}
impl AudioConfig {
    /// Returns the number of decoder layers.
    ///
    /// Uses `decoder_layers` when it is set and falls back to `num_layers`
    /// otherwise.
    pub fn decoder_layer_count(&self) -> usize {
        self.decoder_layers.unwrap_or(self.num_layers)
    }

    /// Returns the feed-forward (intermediate) dimension.
    ///
    /// Uses `intermediate_size` when it is set and falls back to
    /// `hidden_size * 4` otherwise.
    ///
    /// # Panics
    ///
    /// When `intermediate_size` is `None` and `hidden_size * 4` overflows
    /// `usize`, this panics in builds with overflow checks enabled (and wraps
    /// otherwise), like any plain `usize` multiplication.
    pub fn ffn_dim(&self) -> usize {
        const FFN_MULTIPLIER: usize = 4;
        // The fallback is computed lazily: an explicitly configured
        // `intermediate_size` must never be affected by (or panic because of)
        // the multiplication that only the fallback path needs.
        self.intermediate_size
            .unwrap_or_else(|| self.hidden_size * FFN_MULTIPLIER)
    }
}
/// Serde fallback for `AudioConfig::vocab_size` when the field is absent.
///
/// NOTE(review): 51865 looks like a model-specific tokenizer vocabulary size;
/// confirm which model this default is meant to match.
fn default_vocab_size() -> usize {
    const VOCAB_SIZE: usize = 51_865;
    VOCAB_SIZE
}
/// Serde fallback for `AudioConfig::max_target_positions` when the field is
/// absent.
fn default_max_target_positions() -> usize {
    const MAX_TARGET_POSITIONS: usize = 448;
    MAX_TARGET_POSITIONS
}
/// Serde fallback for `AudioConfig::num_mel_bins` when the field is absent.
fn default_num_mel_bins() -> usize {
    const NUM_MEL_BINS: usize = 128;
    NUM_MEL_BINS
}
/// Serde fallback for `AudioConfig::max_audio_len` when the field is absent.
///
/// NOTE(review): the unit of this length is not stated in this file; confirm
/// against the code that consumes it.
fn default_max_audio_len() -> usize {
    const MAX_AUDIO_LEN: usize = 3_000;
    MAX_AUDIO_LEN
}
/// Serde fallback for `AudioConfig::projector_type` when the field is absent.
///
/// Returns a freshly allocated `String` because the field is owned.
fn default_audio_projector_type() -> String {
    String::from("linear")
}