1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/// Whisper model size variants, listed from smallest to largest.
///
/// A variant only names a preset; the concrete architecture dimensions are
/// carried by the [`WhisperConfig`] built for that size.
///
/// The type is a plain fieldless enum, so it is cheap to copy, can be compared
/// with `==`, and (via `Hash`) can be used as a `HashMap`/`HashSet` key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ModelSize {
    /// Tiny variant (see [`WhisperConfig::tiny`]).
    Tiny,
    /// Base variant (see [`WhisperConfig::base`]).
    Base,
    /// Small variant (see [`WhisperConfig::small`]).
    Small,
    /// Medium variant (see [`WhisperConfig::medium`]).
    Medium,
    /// Large variant (see [`WhisperConfig::large`]).
    Large,
}
/// Architecture hyper-parameters for a Whisper model.
///
/// Every field is public, so a configuration can be built as a struct literal
/// or adjusted field by field after construction.
///
/// `PartialEq` is derived so two configurations can be compared with `==`
/// (for example in tests). `Eq` is intentionally not derived because
/// `dropout` is an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct WhisperConfig {
    /// Which model size variant this configuration describes.
    pub model_size: ModelSize,
    /// Vocabulary size (number of tokens).
    pub vocab_size: usize,
    /// Number of mel frequency bins.
    pub num_mel_bins: usize,
    /// Model dimensionality (hidden size).
    pub d_model: usize,
    /// Number of encoder layers.
    pub encoder_layers: usize,
    /// Number of decoder layers.
    pub decoder_layers: usize,
    /// Number of attention heads.
    pub num_heads: usize,
    /// Feed-forward network dimension.
    pub ffn_dim: usize,
    /// Dropout probability.
    pub dropout: f64,
    /// Maximum number of encoder positions (audio frames).
    pub max_source_positions: usize,
    /// Maximum number of decoder positions (text tokens).
    pub max_target_positions: usize,
}
impl WhisperConfig {
/// Create configuration for Tiny model (39M parameters)
pub fn tiny() -> Self {
Self {
model_size: ModelSize::Tiny,
vocab_size: 51865,
num_mel_bins: 80,
d_model: 384,
encoder_layers: 4,
decoder_layers: 4,
num_heads: 6,
ffn_dim: 1536,
dropout: 0.0,
max_source_positions: 1500,
max_target_positions: 448,
}
}
/// Create configuration for Base model (74M parameters)
pub fn base() -> Self {
Self {
model_size: ModelSize::Base,
vocab_size: 51865,
num_mel_bins: 80,
d_model: 512,
encoder_layers: 6,
decoder_layers: 6,
num_heads: 8,
ffn_dim: 2048,
dropout: 0.0,
max_source_positions: 1500,
max_target_positions: 448,
}
}
/// Create configuration for Small model (244M parameters)
pub fn small() -> Self {
Self {
model_size: ModelSize::Small,
vocab_size: 51865,
num_mel_bins: 80,
d_model: 768,
encoder_layers: 12,
decoder_layers: 12,
num_heads: 12,
ffn_dim: 3072,
dropout: 0.0,
max_source_positions: 1500,
max_target_positions: 448,
}
}
/// Create configuration for Medium model (769M parameters)
pub fn medium() -> Self {
Self {
model_size: ModelSize::Medium,
vocab_size: 51865,
num_mel_bins: 80,
d_model: 1024,
encoder_layers: 24,
decoder_layers: 24,
num_heads: 16,
ffn_dim: 4096,
dropout: 0.0,
max_source_positions: 1500,
max_target_positions: 448,
}
}
/// Create configuration for Large model (1550M parameters)
///
/// Note: The HuggingFace whisper-large checkpoint uses 128 mel bins (vs 80
/// for smaller models) and a vocab of 51866 (51865 + 1 padding token).
pub fn large() -> Self {
Self {
model_size: ModelSize::Large,
vocab_size: 51866,
num_mel_bins: 128,
d_model: 1280,
encoder_layers: 32,
decoder_layers: 32,
num_heads: 20,
ffn_dim: 5120,
dropout: 0.0,
max_source_positions: 1500,
max_target_positions: 448,
}
}
/// Get the dimension of each attention head
pub fn head_dim(&self) -> usize {
self.d_model / self.num_heads
}
}