use serde::Deserialize;
use crate::config::rope_config::RopeScaling;
/// Deserializable configuration for the code predictor.
///
/// Fields carrying a `#[serde(default ...)]` attribute may be left out of the
/// serialized input; the fallback named by the attribute is used in that case.
#[derive(Debug, Clone, Deserialize)]
pub struct CodePredictorConfig {
    /// Vocabulary size.
    #[serde(default = "default_vocab_size")]
    pub vocab_size: usize,

    /// Hidden size; exposed through `AttentionConfig::hidden_size`.
    #[serde(default = "default_hidden_size")]
    pub hidden_size: usize,

    /// Intermediate size (presumably the feed-forward width; confirm against
    /// the model code).
    #[serde(default = "default_intermediate_size")]
    pub intermediate_size: usize,

    /// Number of hidden layers. `get_layer_types` generates this many labels
    /// when `layer_types` is unset.
    #[serde(default = "default_num_hidden_layers")]
    pub num_hidden_layers: usize,

    /// Number of attention heads.
    #[serde(default = "default_num_attention_heads")]
    pub num_attention_heads: usize,

    /// Number of key/value heads.
    #[serde(default = "default_num_key_value_heads")]
    pub num_key_value_heads: usize,

    /// Per-head dimension.
    #[serde(default = "default_head_dim")]
    pub head_dim: usize,

    /// Activation identifier, stored as a string.
    #[serde(default = "default_hidden_act")]
    pub hidden_act: String,

    /// Maximum number of position embeddings.
    #[serde(default = "default_max_position_embeddings")]
    pub max_position_embeddings: usize,

    /// Initializer range.
    #[serde(default = "default_initializer_range")]
    pub initializer_range: f64,

    /// RMS-norm epsilon.
    #[serde(default = "default_rms_norm_eps")]
    pub rms_norm_eps: f64,

    /// RoPE theta.
    #[serde(default = "default_rope_theta")]
    pub rope_theta: f64,

    /// Optional RoPE scaling settings.
    pub rope_scaling: Option<RopeScaling>,

    /// Attention bias flag; `false` when omitted.
    #[serde(default)]
    pub attention_bias: bool,

    /// Attention dropout; `0.0` when omitted.
    #[serde(default)]
    pub attention_dropout: f64,

    /// Number of code groups.
    #[serde(default = "default_num_code_groups")]
    pub num_code_groups: usize,

    /// Lets `get_layer_types` emit `"sliding_attention"` labels; `false` when
    /// omitted.
    #[serde(default)]
    pub use_sliding_window: bool,

    /// Sliding-window size. An omitted key falls back to
    /// `default_sliding_window()`.
    #[serde(default = "default_sliding_window")]
    pub sliding_window: Option<usize>,

    /// First layer index for which `get_layer_types` generates
    /// `"sliding_attention"` (only when `use_sliding_window` is set and
    /// `layer_types` is absent).
    #[serde(default = "default_max_window_layers")]
    pub max_window_layers: usize,

    /// Explicit per-layer attention labels. When present, `get_layer_types`
    /// returns a copy of this list unchanged.
    pub layer_types: Option<Vec<String>>,

    /// Optional padding token id.
    pub pad_token_id: Option<usize>,
}
/// Fallback for `CodePredictorConfig::vocab_size`, used by its serde `default`
/// attribute and by the `Default` impl.
fn default_vocab_size() -> usize {
    2_048
}
/// Fallback for `CodePredictorConfig::hidden_size`, used by its serde `default`
/// attribute and by the `Default` impl.
fn default_hidden_size() -> usize {
    1_024
}
/// Fallback for `CodePredictorConfig::intermediate_size`, used by its serde
/// `default` attribute and by the `Default` impl.
fn default_intermediate_size() -> usize {
    3_072
}
/// Fallback for `CodePredictorConfig::num_hidden_layers`, used by its serde
/// `default` attribute and by the `Default` impl.
fn default_num_hidden_layers() -> usize {
    5_usize
}
/// Fallback for `CodePredictorConfig::num_attention_heads`, used by its serde
/// `default` attribute and by the `Default` impl.
fn default_num_attention_heads() -> usize {
    16_usize
}
/// Fallback for `CodePredictorConfig::num_key_value_heads`, used by its serde
/// `default` attribute and by the `Default` impl.
fn default_num_key_value_heads() -> usize {
    8_usize
}
/// Fallback for `CodePredictorConfig::head_dim`, used by its serde `default`
/// attribute and by the `Default` impl.
fn default_head_dim() -> usize {
    128_usize
}
/// Fallback for `CodePredictorConfig::hidden_act`, used by its serde `default`
/// attribute and by the `Default` impl.
fn default_hidden_act() -> String {
    String::from("silu")
}
/// Fallback for `CodePredictorConfig::max_position_embeddings`, used by its
/// serde `default` attribute and by the `Default` impl.
fn default_max_position_embeddings() -> usize {
    32_768
}
/// Fallback for `CodePredictorConfig::initializer_range`, used by its serde
/// `default` attribute and by the `Default` impl.
fn default_initializer_range() -> f64 {
    0.02_f64
}
/// Fallback for `CodePredictorConfig::rms_norm_eps`, used by its serde
/// `default` attribute and by the `Default` impl.
fn default_rms_norm_eps() -> f64 {
    1.0e-6
}
/// Fallback for `CodePredictorConfig::rope_theta`, used by its serde `default`
/// attribute and by the `Default` impl.
fn default_rope_theta() -> f64 {
    10_000.0
}
/// Fallback for `CodePredictorConfig::num_code_groups`, used by its serde
/// `default` attribute and by the `Default` impl.
fn default_num_code_groups() -> usize {
    32_usize
}
/// Fallback for `CodePredictorConfig::sliding_window`, used by its serde
/// `default` attribute and by the `Default` impl.
fn default_sliding_window() -> Option<usize> {
    let window: usize = 4_096;
    Some(window)
}
/// Fallback for `CodePredictorConfig::max_window_layers`, used by its serde
/// `default` attribute and by the `Default` impl.
fn default_max_window_layers() -> usize {
    28_usize
}
impl Default for CodePredictorConfig {
fn default() -> Self {
Self {
vocab_size: default_vocab_size(),
hidden_size: default_hidden_size(),
intermediate_size: default_intermediate_size(),
num_hidden_layers: default_num_hidden_layers(),
num_attention_heads: default_num_attention_heads(),
num_key_value_heads: default_num_key_value_heads(),
head_dim: default_head_dim(),
hidden_act: default_hidden_act(),
max_position_embeddings: default_max_position_embeddings(),
initializer_range: default_initializer_range(),
rms_norm_eps: default_rms_norm_eps(),
rope_theta: default_rope_theta(),
rope_scaling: None,
attention_bias: false,
attention_dropout: 0.0,
num_code_groups: default_num_code_groups(),
use_sliding_window: false,
sliding_window: default_sliding_window(),
max_window_layers: default_max_window_layers(),
layer_types: None,
pad_token_id: None,
}
}
}
impl CodePredictorConfig {
pub fn get_layer_types(&self) -> Vec<String> {
if let Some(ref types) = self.layer_types {
types.clone()
} else {
(0..self.num_hidden_layers)
.map(|i| {
if self.use_sliding_window && i >= self.max_window_layers {
"sliding_attention".to_string()
} else {
"full_attention".to_string()
}
})
.collect()
}
}
}
/// Exposes the attention-related fields of `CodePredictorConfig` through the
/// `AttentionConfig` trait. Each accessor returns the stored field value as-is.
impl crate::nn::attention::config::AttentionConfig for CodePredictorConfig {
    /// Returns the `hidden_size` field.
    fn hidden_size(&self) -> usize {
        self.hidden_size
    }

    /// Returns the `num_attention_heads` field.
    fn num_attention_heads(&self) -> usize {
        self.num_attention_heads
    }

    /// Returns the `num_key_value_heads` field.
    fn num_key_value_heads(&self) -> usize {
        self.num_key_value_heads
    }

    /// Returns the `head_dim` field.
    fn head_dim(&self) -> usize {
        self.head_dim
    }

    /// Returns the `attention_bias` field.
    fn attention_bias(&self) -> bool {
        self.attention_bias
    }

    /// Returns the `rms_norm_eps` field.
    fn rms_norm_eps(&self) -> f64 {
        self.rms_norm_eps
    }

    /// Returns the `sliding_window` field.
    ///
    /// NOTE(review): `use_sliding_window` is not consulted here, so a
    /// configured window is reported even while that flag is `false`.
    /// Presumably callers gate on the per-layer type; confirm.
    fn sliding_window(&self) -> Option<usize> {
        self.sliding_window
    }
}