llms_from_scratch_rs/exercises/ch04.rs

use crate::Exercise;
use anyhow::Result;

pub struct X1;

impl Exercise for X1 {
    fn name(&self) -> String {
        String::from("4.1")
    }

    fn title(&self) -> String {
        "Number of parameters in feed forward and attention modules".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Calculate and compare the number of parameters that are contained in the feed forward module \
        and those that are contained in the multi-head attention module.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch04::{Config, TransformerBlock};
        use candle_core::{DType, Device};
        use candle_nn::{VarBuilder, VarMap};

        // Build a single transformer block so its parameters land in the VarMap.
        let dev = Device::cuda_if_available(0)?;
        let varmap = VarMap::new();
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let _ = TransformerBlock::new(Config::gpt2_124m(), vb)?;

        // Access the registered tensors by their variable names.
        let varmap_data = varmap.data().lock().unwrap();

        // Tally parameters by module prefix ("ff." vs "mha.").
        let (mut ff_params, mut mha_params) = (0_usize, 0_usize);
        for (var_name, var) in varmap_data.iter() {
            let num_params = var.elem_count();
            if var_name.starts_with("ff.") {
                ff_params += num_params;
            } else if var_name.starts_with("mha.") {
                mha_params += num_params;
            }
        }
        println!("Ff number of parameters: {}", ff_params);
        println!("Mha number of parameters: {}", mha_params);
        Ok(())
    }
}
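// A worked cross-check for Exercise 4.1 (a sketch, not used by the exercise):
// with the GPT-2 124M shapes assumed above (emb_dim = 768, qkv_bias = false,
// and a biased attention output projection as in the book's reference code),
// the feed forward module should hold 768*3072 + 3072 + 3072*768 + 768
// = 4,722,432 parameters and the attention module 3*(768*768) + 768*768 + 768
// = 2,360,064, i.e. roughly a 2:1 ratio. The helper below reproduces that
// arithmetic for an arbitrary embedding width.
#[allow(dead_code)]
fn expected_ff_and_mha_params(emb_dim: usize, qkv_bias: bool) -> (usize, usize) {
    // Feed forward: expansion to 4*emb_dim and back, both layers with biases.
    let ff = emb_dim * 4 * emb_dim + 4 * emb_dim + 4 * emb_dim * emb_dim + emb_dim;
    // Attention: Q/K/V projections (optionally biased) plus a biased output projection.
    let qkv_bias_params = if qkv_bias { 3 * emb_dim } else { 0 };
    let mha = 3 * emb_dim * emb_dim + qkv_bias_params + emb_dim * emb_dim + emb_dim;
    (ff, mha)
}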

pub struct X2;

impl Exercise for X2 {
    fn name(&self) -> String {
        String::from("4.2")
    }

    fn title(&self) -> String {
        "Initializing larger GPT models".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "We initialized a 124-million-parameter GPT model, \
        which is known as 'GPT-2 small.' Without making any code modifications \
        besides updating the configuration file, use the GPTModel class to \
        implement GPT-2 medium (using 1,024-dimensional embeddings, 24 transformer \
        blocks, 16 multi-head attention heads), GPT-2 large (1,280-dimensional \
        embeddings, 36 transformer blocks, 20 multi-head attention heads), and \
        GPT-2 XL (1,600-dimensional embeddings, 48 transformer blocks, 25 \
        multi-head attention heads). As a bonus, calculate the total number of \
        parameters in each GPT model.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch04::{Config, GPTModel};
        use candle_core::{DType, Device};
        use candle_nn::{VarBuilder, VarMap};

        let configs = &[
            ("gpt2-sm", Config::gpt2_124m()),
            ("gpt2-med", Config::gpt2_medium()),
            ("gpt2-l", Config::gpt2_large()),
            ("gpt2-xl", Config::gpt2_xlarge()),
        ];

        for (mdl_name, cfg) in configs.iter() {
            // Build the model so all of its parameters are registered in the VarMap.
            let dev = Device::cuda_if_available(0)?;
            let varmap = VarMap::new();
            let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
            let _ = GPTModel::new(*cfg, vb)?;

            // Total parameter count across all variables.
            let mut total_params = 0_usize;
            for t in varmap.all_vars().iter() {
                total_params += t.elem_count();
            }
            println!("{} number of parameters: {}", mdl_name, total_params);

            // Inspect the embedding and output-head shapes by name.
            let varmap_data = varmap.data().lock().unwrap();
            let tok_emb_dims = varmap_data.get("tok_emb.weight").unwrap().dims();
            println!("Token embedding layer shape {:?}", tok_emb_dims);
            let out_head_dims = varmap_data.get("out_head.weight").unwrap().dims();
            println!("Output layer shape {:?}", out_head_dims);

            // GPT-2 ties the output head to the token embedding, so subtract it.
            let total_params_gpt2 = total_params - (out_head_dims[0] * out_head_dims[1]);
            println!(
                "Number of trainable parameters considering weight tying {}",
                total_params_gpt2
            );

            // Memory footprint assuming 4 bytes per f32 parameter.
            let total_size_bytes = total_params * 4;
            let total_size_mb = total_size_bytes as f32 / (1024_f32 * 1024.);
            println!("Total size of the model: {} MB\n", total_size_mb);
        }
        Ok(())
    }
}
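// A rough analytic cross-check for Exercise 4.2 (a sketch, not used by the
// exercise): it recomputes the total parameter count from the layer shapes the
// loop above reads out of the VarMap, assuming bias-free Q/K/V projections, a
// biased attention output projection, biased feed-forward layers, LayerNorms
// with a scale and a shift vector each, and an untied output head. For the 124M
// configuration (50,257 / 1,024 / 768 / 12) this yields 163,009,536 parameters,
// i.e. ~124M once the output head is tied to the token embedding.
#[allow(dead_code)]
fn expected_total_params(
    vocab_size: usize,
    context_length: usize,
    emb_dim: usize,
    n_layers: usize,
) -> usize {
    let mha = 3 * emb_dim * emb_dim // Q, K, V projections (no bias)
        + emb_dim * emb_dim + emb_dim; // output projection + bias
    let ff = emb_dim * 4 * emb_dim + 4 * emb_dim // expansion layer + bias
        + 4 * emb_dim * emb_dim + emb_dim; // contraction layer + bias
    let norms = 2 * (2 * emb_dim); // two LayerNorms per block (scale + shift)
    let block = mha + ff + norms;
    vocab_size * emb_dim // token embedding
        + context_length * emb_dim // positional embedding
        + n_layers * block
        + 2 * emb_dim // final LayerNorm
        + emb_dim * vocab_size // output head (untied)
}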

pub struct X3;

impl Exercise for X3 {
    fn name(&self) -> String {
        String::from("4.3")
    }

    fn title(&self) -> String {
        "Using separate dropout parameters".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "At the beginning of this chapter, we defined a global \
        `drop_rate` setting in the `GPT_CONFIG_124M` dictionary to set the \
        dropout rate in various places throughout the GPTModel architecture. \
        Change the code to specify a separate dropout value for the various \
        dropout layers throughout the model architecture. (Hint: there are three \
        distinct places where we used dropout layers: the embedding layer, \
        shortcut layer, and multi-head attention module.)";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch04::GPTModel;
        use candle_core::{DType, Device, IndexOp, ModuleT, Tensor};
        use candle_nn::{VarBuilder, VarMap};

        // Build the model with the per-site dropout config from `addons`.
        let dev = Device::cuda_if_available(0)?;
        let varmap = VarMap::new();
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let model = GPTModel::new_v2(addons::ConfigV2::gpt_config_124m(), vb)?;

        // A small batch of token ids.
        let batch = Tensor::new(&[[101_u32, 366, 100, 345], [101, 110, 322, 57]], &dev)?;

        // Forward pass with training disabled, so dropout is a no-op here.
        let logits = model.forward_t(&batch, false)?;

        // Keep only the logits at the last token position of each sequence.
        let (_b, c, _vocab_size) = logits.dims3()?;
        let last_tokens_logits = logits.i((.., c - 1, ..))?;
        println!(
            "first 10 logits of last vector: {:?}",
            last_tokens_logits.i((.., 0..10))?.to_vec2::<f32>()
        );
        Ok(())
    }
}
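// Exercise 4.3's `main` above keeps the uniform 0.1 defaults; the point of
// `ConfigV2` is that the three dropout sites can now differ. A hypothetical
// variant (values chosen purely for illustration) can be built with
// struct-update syntax because `ConfigV2` is `Copy`:
#[allow(dead_code)]
fn gpt_config_124m_custom_dropout() -> addons::ConfigV2 {
    addons::ConfigV2 {
        drop_rate_emb: 0.1,       // dropout applied after token + positional embeddings
        drop_rate_shortcut: 0.05, // dropout on the residual (shortcut) connections
        drop_rate_attn: 0.2,      // dropout inside the multi-head attention module
        ..addons::ConfigV2::gpt_config_124m()
    }
}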

pub mod addons {
    use crate::listings::{
        ch03::MultiHeadAttention,
        ch04::{
            seqtransformers, FFLayer, FeedForward, GPTModel, LayerNorm, TransformerBlock, GELU,
        },
    };
    use candle_core::Result;
    use candle_nn::{embedding, linear_b, Dropout, VarBuilder};

    /// A config variant with a separate dropout rate for each dropout site.
    #[derive(Debug, Clone, Copy)]
    pub struct ConfigV2 {
        pub vocab_size: usize,
        pub context_length: usize,
        pub emb_dim: usize,
        pub n_heads: usize,
        pub n_layers: usize,
        pub drop_rate_attn: f32,
        pub drop_rate_emb: f32,
        pub drop_rate_shortcut: f32,
        pub qkv_bias: bool,
    }

    impl ConfigV2 {
        pub fn gpt_config_124m() -> Self {
            Self {
                vocab_size: 50_257,
                context_length: 1_024,
                emb_dim: 768,
                n_heads: 12,
                n_layers: 12,
                drop_rate_attn: 0.1,
                drop_rate_emb: 0.1,
                drop_rate_shortcut: 0.1,
                qkv_bias: false,
            }
        }
    }
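
    // Hypothetical larger variant (not used by the exercises above): the GPT-2
    // medium figures quoted in Exercise 4.2 (1,024-dimensional embeddings,
    // 24 blocks, 16 heads), reusing the per-site dropout defaults via
    // struct-update syntax.
    impl ConfigV2 {
        #[allow(dead_code)]
        pub fn gpt_config_medium() -> Self {
            Self {
                emb_dim: 1_024,
                n_heads: 16,
                n_layers: 24,
                ..Self::gpt_config_124m()
            }
        }
    }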

    impl FeedForward {
        /// `FeedForward` constructor taking the `ConfigV2` defined above.
        fn new_v2(cfg: ConfigV2, vb: VarBuilder<'_>) -> Result<Self> {
            let layers = vec![
                FFLayer::Linear(linear_b(
                    cfg.emb_dim,
                    4_usize * cfg.emb_dim,
                    true,
                    vb.pp("first_layer"),
                )?),
                FFLayer::GELU(GELU),
                FFLayer::Linear(linear_b(
                    4_usize * cfg.emb_dim,
                    cfg.emb_dim,
                    true,
                    vb.pp("second_layer"),
                )?),
            ];

            FeedForward::from_fields(layers)
        }
    }

    impl TransformerBlock {
        /// Like `TransformerBlock::new`, but wires in the per-site dropout rates.
        fn new_v2(cfg: ConfigV2, vb: VarBuilder<'_>) -> Result<Self> {
            let att = MultiHeadAttention::new(
                cfg.emb_dim,
                cfg.emb_dim,
                cfg.drop_rate_attn,
                cfg.n_heads,
                cfg.qkv_bias,
                vb.pp("mha"),
            )?;
            let ff = FeedForward::new_v2(cfg, vb.pp("ff"))?;
            let norm1 = LayerNorm::new(cfg.emb_dim, vb.pp("norm1"))?;
            let norm2 = LayerNorm::new(cfg.emb_dim, vb.pp("norm2"))?;
            let drop_shortcut = Dropout::new(cfg.drop_rate_shortcut);
            TransformerBlock::from_fields(att, ff, norm1, norm2, drop_shortcut)
        }
    }

    impl GPTModel {
        /// Like `GPTModel::new`, but uses `ConfigV2` and its per-site dropout rates.
        pub fn new_v2(cfg: ConfigV2, vb: VarBuilder<'_>) -> Result<Self> {
            let tok_emb = embedding(cfg.vocab_size, cfg.emb_dim, vb.pp("tok_emb"))?;
            let pos_emb = embedding(cfg.context_length, cfg.emb_dim, vb.pp("pos_emb"))?;
            let drop_emb = Dropout::new(cfg.drop_rate_emb);
            let mut trf_blocks = seqtransformers();
            for ix in 0..cfg.n_layers {
                trf_blocks =
                    trf_blocks.add(TransformerBlock::new_v2(cfg, vb.pp(format!("trf-{}", ix)))?);
            }
            let final_norm = LayerNorm::new(cfg.emb_dim, vb.pp("final_norm"))?;
            let out_head = linear_b(cfg.emb_dim, cfg.vocab_size, false, vb.pp("out_head"))?;
            GPTModel::from_fields(tok_emb, pos_emb, drop_emb, trf_blocks, final_norm, out_head)
        }
    }
}
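
// A hypothetical smoke test (not part of the original exercises): it builds the
// v2 model from `addons::ConfigV2` on CPU with deliberately tiny, illustrative
// dimensions and checks that the logits come back as [batch, seq_len, vocab_size],
// the shape Exercise 4.3's `main` relies on above.
#[cfg(test)]
mod addons_tests {
    use super::addons::ConfigV2;
    use crate::listings::ch04::GPTModel;
    use candle_core::{DType, Device, ModuleT, Tensor};
    use candle_nn::{VarBuilder, VarMap};

    #[test]
    fn new_v2_produces_vocab_sized_logits() -> anyhow::Result<()> {
        // Tiny configuration to keep the test fast; values are illustrative only.
        let cfg = ConfigV2 {
            vocab_size: 100,
            context_length: 16,
            emb_dim: 8,
            n_heads: 2,
            n_layers: 2,
            drop_rate_attn: 0.1,
            drop_rate_emb: 0.1,
            drop_rate_shortcut: 0.1,
            qkv_bias: false,
        };

        let dev = Device::Cpu;
        let varmap = VarMap::new();
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let model = GPTModel::new_v2(cfg, vb)?;

        // A single 4-token batch; token ids just need to be below vocab_size.
        let batch = Tensor::new(&[[1_u32, 2, 3, 4]], &dev)?;
        let logits = model.forward_t(&batch, false)?;
        assert_eq!(logits.dims3()?, (1, 4, cfg.vocab_size));
        Ok(())
    }
}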