llms_from_scratch_rs/exercises/ch04.rs

//! Exercises from Chapter 4

use crate::Exercise;
use anyhow::Result;

/// # Number of parameters in feed forward and attention modules
///
/// #### Id
/// 4.1
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 4.1
///
/// # with cuda
/// cargo run --features cuda exercise 4.1
/// ```
pub struct X1;

impl Exercise for X1 {
    fn name(&self) -> String {
        String::from("4.1")
    }

    fn title(&self) -> String {
        "Number of parameters in feed forward and attention modules".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Calculate and compare the number of parameters that are contained in the feed forward module \
        and those that are contained in the multi-head attention module.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch04::{Config, TransformerBlock};
        use candle_core::{DType, Device};
        use candle_nn::{VarBuilder, VarMap};

        // create model
        let dev = Device::cuda_if_available(0)?;
        let varmap = VarMap::new();
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let _ = TransformerBlock::new(Config::gpt2_124m(), vb)?;
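        // constructing the block registers all of its weights in the VarMap under
        // module prefixes such as "ff." and "mha.", which we match on below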

        // Get varmap data containing all variables
        let varmap_data = varmap.data().lock().unwrap();

        // Count params for ff and mha modules
        let (mut ff_params, mut mha_params) = (0_usize, 0_usize);
        for (var_name, var) in varmap_data.iter() {
            let num_params = var.elem_count();
            if var_name.starts_with("ff.") {
                ff_params += num_params;
            } else if var_name.starts_with("mha.") {
                mha_params += num_params;
            }
        }
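        // For the 124M config (emb_dim = 768, qkv_bias = false) the counts should work out
        // to roughly ff = 768*3072 + 3072 + 3072*768 + 768 = 4,722,432 and
        // mha = 3 * 768*768 + (768*768 + 768) = 2,360,064 (assuming a biased output projection)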
60        println!("Ff number of parameters: {}", ff_params);
61        println!("Mha number of parameters: {}", mha_params);
62        Ok(())
63    }
64}

/// # Initializing larger GPT models
///
/// #### Id
/// 4.2
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 4.2
///
/// # with cuda
/// cargo run --features cuda exercise 4.2
/// ```
pub struct X2;

impl Exercise for X2 {
    fn name(&self) -> String {
        String::from("4.2")
    }

    fn title(&self) -> String {
        "Initializing larger GPT models".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "We initialized a 124-million-parameter GPT model, \
        which is known as 'GPT-2 small.' Without making any code modifications \
        besides updating the configuration file, use the GPTModel class to \
        implement GPT-2 medium (using 1,024-dimensional embeddings, 24 transformer \
        blocks, 16 multi-head attention heads), GPT-2 large (1,280-dimensional \
        embeddings, 36 transformer blocks, 20 multi-head attention heads), and \
        GPT-2 XL (1,600-dimensional embeddings, 48 transformer blocks, 25 \
        multi-head attention heads). As a bonus, calculate the total number of \
        parameters in each GPT model.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch04::{Config, GPTModel};
        use candle_core::{DType, Device};
        use candle_nn::{VarBuilder, VarMap};

        let configs = &[
            ("gpt2-sm", Config::gpt2_124m()),
            ("gpt2-med", Config::gpt2_medium()),
            ("gpt2-l", Config::gpt2_large()),
            ("gpt2-xl", Config::gpt2_xlarge()),
        ];
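        // per the exercise statement, these presets should correspond to: medium
        // (1,024-dim, 24 blocks, 16 heads), large (1,280-dim, 36 blocks, 20 heads),
        // and xl (1,600-dim, 48 blocks, 25 heads)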

        for (mdl_name, cfg) in configs.iter() {
            // construct the model, which registers its vars in the varmap
            let dev = Device::cuda_if_available(0)?;
            let varmap = VarMap::new();
            let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
            let _ = GPTModel::new(*cfg, vb)?;

            // compute number of params (todo: factor into a utility; see the `count_params` sketch below)
            let mut total_params = 0_usize;
            for t in varmap.all_vars().iter() {
                total_params += t.elem_count();
            }
            println!("{} number of parameters: {}", mdl_name, total_params);

            // Get token embedding and output layer shapes
            let varmap_data = varmap.data().lock().unwrap();
            let tok_emb_dims = varmap_data.get("tok_emb.weight").unwrap().dims();
            println!("Token embedding layer shape {:?}", tok_emb_dims);
            let out_head_dims = varmap_data.get("out_head.weight").unwrap().dims();
            println!("Output layer shape {:?}", out_head_dims);

            // total number of params if weight tying is used (the output head would share the token embedding weights)
            let total_params_gpt2 = total_params - (out_head_dims[0] * out_head_dims[1]);
            println!(
                "Number of trainable parameters considering weight tying {}",
                total_params_gpt2
            );
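            // with this implementation, the 124M preset should come out to 163,009,536
            // parameters in total and 124,412,160 once the output head is excluded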

            // memory requirements (todo: build this out as a util)
            let total_size_bytes = total_params * 4; // f32 params are 4 bytes each
            let total_size_mb = total_size_bytes as f32 / (1024_f32 * 1024.);
            println!("Total size of the model: {} MB\n", total_size_mb);
        }
        Ok(())
    }
}
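
/// A minimal sketch of the parameter-counting utility referenced by the TODO in
/// Exercise 4.2 above. The name `count_params` is a placeholder (not part of the
/// listings); it only reuses the same `VarMap::all_vars` and `elem_count` calls.
#[allow(dead_code)]
fn count_params(varmap: &candle_nn::VarMap) -> usize {
    varmap.all_vars().iter().map(|v| v.elem_count()).sum()
}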

/// # Using separate dropout parameters
///
/// #### Id
/// 4.3
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 4.3
///
/// # with cuda
/// cargo run --features cuda exercise 4.3
/// ```
pub struct X3;

impl Exercise for X3 {
    fn name(&self) -> String {
        String::from("4.3")
    }

    fn title(&self) -> String {
        "Using separate dropout parameters".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "At the beginning of this chapter, we defined a global \
        `drop_rate` setting in the `GPT_CONFIG_124M` dictionary to set the \
        dropout rate in various places throughout the GPTModel architecture. \
        Change the code to specify a separate dropout value for the various \
        dropout layers throughout the model architecture. (Hint: there are three \
        distinct places where we used dropout layers: the embedding layer, \
        shortcut layer, and multi-head attention module.)";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch04::GPTModel;
        use candle_core::{DType, Device, IndexOp, ModuleT, Tensor};
        use candle_nn::{VarBuilder, VarMap};

        // create model
        let dev = Device::cuda_if_available(0)?;
        let varmap = VarMap::new();
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let model = GPTModel::new_v2(addons::ConfigV2::gpt_config_124m(), vb)?;
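        // `ConfigV2` and `GPTModel::new_v2` are defined in the `addons` module below;
        // they thread three separate dropout rates (attention, embedding, shortcut) through the model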

        // create batch inputs
        let batch = Tensor::new(&[[101_u32, 366, 100, 345], [101, 110, 322, 57]], &dev)?;

        // run model forward
        let logits = model.forward_t(&batch, false)?;

        // print the first ten logits over the vocabulary for the last token of each batch input
        let (_b, c, _vocab_size) = logits.dims3()?;
        let last_tokens_logits = logits.i((.., c - 1, ..))?;
        println!(
            "first 10 logits of last vector: {:?}",
            last_tokens_logits.i((.., 0..10))?.to_vec2::<f32>()
        );
        Ok(())
    }
}

pub mod addons {
    //! Auxiliary module for exercises::ch04
    use crate::listings::{
        ch03::MultiHeadAttention,
        ch04::{
            seqtransformers, FFLayer, FeedForward, GPTModel, LayerNorm, TransformerBlock, GELU,
        },
    };
    use candle_core::Result;
    use candle_nn::{embedding, linear_b, Dropout, VarBuilder};

    /// A second `Config` variation for Exercise 4.3 to specify individual drop rates
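    ///
    /// A minimal sketch of building the 124M preset and then overriding a single rate
    /// via struct-update syntax (the value 0.2 is illustrative only):
    ///
    /// ```ignore
    /// let cfg = ConfigV2 {
    ///     drop_rate_shortcut: 0.2,
    ///     ..ConfigV2::gpt_config_124m()
    /// };
    /// ```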
    #[derive(Debug, Clone, Copy)]
    pub struct ConfigV2 {
        pub vocab_size: usize,
        pub context_length: usize,
        pub emb_dim: usize,
        pub n_heads: usize,
        pub n_layers: usize,
        pub drop_rate_attn: f32,
        pub drop_rate_emb: f32,
        pub drop_rate_shortcut: f32,
        pub qkv_bias: bool,
    }

    impl ConfigV2 {
        pub fn gpt_config_124m() -> Self {
            Self {
                vocab_size: 50_257,
                context_length: 1_024,
                emb_dim: 768,
                n_heads: 12,
                n_layers: 12,
                drop_rate_attn: 0.1,
                drop_rate_emb: 0.1,
                drop_rate_shortcut: 0.1,
                qkv_bias: false,
            }
        }
    }

    /// New `FeedForward` constructor using `ConfigV2`
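    /// (only `emb_dim` is read from the config here; per the exercise hint, none of the
    /// separate dropout rates apply inside the feed forward block)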
    impl FeedForward {
        fn new_v2(cfg: ConfigV2, vb: VarBuilder<'_>) -> Result<Self> {
            let layers = vec![
                FFLayer::Linear(linear_b(
                    cfg.emb_dim,
                    4_usize * cfg.emb_dim,
                    true,
                    vb.pp("first_layer"),
                )?),
                FFLayer::GELU(GELU),
                FFLayer::Linear(linear_b(
                    4_usize * cfg.emb_dim,
                    cfg.emb_dim,
                    true,
                    vb.pp("second_layer"),
                )?),
            ];

            FeedForward::from_fields(layers)
        }
    }

    /// New `TransformerBlock` constructor using `ConfigV2`
    impl TransformerBlock {
        fn new_v2(cfg: ConfigV2, vb: VarBuilder<'_>) -> Result<Self> {
            // attention-internal dropout uses its own rate (drop_rate_attn)
            let att = MultiHeadAttention::new(
                cfg.emb_dim,
                cfg.emb_dim,
                cfg.drop_rate_attn,
                cfg.n_heads,
                cfg.qkv_bias,
                vb.pp("mha"),
            )?;
            let ff = FeedForward::new_v2(cfg, vb.pp("ff"))?;
            let norm1 = LayerNorm::new(cfg.emb_dim, vb.pp("norm1"))?;
            let norm2 = LayerNorm::new(cfg.emb_dim, vb.pp("norm2"))?;
            // the shortcut connections get their own dropout rate
            let drop_shortcut = Dropout::new(cfg.drop_rate_shortcut);
            TransformerBlock::from_fields(att, ff, norm1, norm2, drop_shortcut)
        }
    }

    /// New `GPTModel` constructor using `ConfigV2`
    impl GPTModel {
        pub fn new_v2(cfg: ConfigV2, vb: VarBuilder<'_>) -> Result<Self> {
            let tok_emb = embedding(cfg.vocab_size, cfg.emb_dim, vb.pp("tok_emb"))?;
            let pos_emb = embedding(cfg.context_length, cfg.emb_dim, vb.pp("pos_emb"))?;
            // the embedding dropout gets its own rate
            let drop_emb = Dropout::new(cfg.drop_rate_emb);
            let mut trf_blocks = seqtransformers();
            for ix in 0..cfg.n_layers {
                trf_blocks =
                    trf_blocks.add(TransformerBlock::new_v2(cfg, vb.pp(format!("trf-{}", ix)))?);
            }
            let final_norm = LayerNorm::new(cfg.emb_dim, vb.pp("final_norm"))?;
            let out_head = linear_b(cfg.emb_dim, cfg.vocab_size, false, vb.pp("out_head"))?;
            GPTModel::from_fields(tok_emb, pos_emb, drop_emb, trf_blocks, final_norm, out_head)
        }
    }
}