llms_from_scratch_rs/exercises/ch04.rs

//! Exercises from Chapter 4

use crate::Exercise;
use anyhow::Result;

/// # Number of parameters in feed forward and attention modules
///
/// #### Id
/// 4.1
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 4.1
///
/// # with cuda
/// cargo run --features cuda exercise 4.1
/// ```
pub struct X1;

impl Exercise for X1 {
    fn name(&self) -> String {
        String::from("4.1")
    }

    fn title(&self) -> String {
        "Number of parameters in feed forward and attention modules".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Calculate and compare the number of parameters that are contained in the feed forward module \
        and those that are contained in the multi-head attention module.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch04::{Config, TransformerBlock};
        use candle_core::{DType, Device};
        use candle_nn::{VarBuilder, VarMap};

        // create model
        let dev = Device::cuda_if_available(0)?;
        let varmap = VarMap::new();
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let _ = TransformerBlock::new(Config::gpt2_124m(), vb)?;

        // Get varmap data containing all variables
        let varmap_data = varmap.data().lock().unwrap();

        // Count params for ff and mha modules
        let (mut ff_params, mut mha_params) = (0_usize, 0_usize);
        for (var_name, var) in varmap_data.iter() {
            let num_params = var.elem_count();
            if var_name.starts_with("ff.") {
                ff_params += num_params;
            } else if var_name.starts_with("mha.") {
                mha_params += num_params;
            }
        }
        println!("Ff number of parameters: {}", ff_params);
        println!("Mha number of parameters: {}", mha_params);
        Ok(())
    }
}

/// # Initializing larger GPT models
///
/// #### Id
/// 4.2
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 4.2
///
/// # with cuda
/// cargo run --features cuda exercise 4.2
/// ```
pub struct X2;

impl Exercise for X2 {
    fn name(&self) -> String {
        String::from("4.2")
    }

    fn title(&self) -> String {
        "Initializing larger GPT models".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "We initialized a 124-million-parameter GPT model, \
        which is known as 'GPT-2 small.' Without making any code modifications \
        besides updating the configuration file, use the GPTModel class to \
        implement GPT-2 medium (using 1,024-dimensional embeddings, 24 transformer \
        blocks, 16 multi-head attention heads), GPT-2 large (1,280-dimensional \
        embeddings, 36 transformer blocks, 20 multi-head attention heads), and \
        GPT-2 XL (1,600-dimensional embeddings, 48 transformer blocks, 25 \
        multi-head attention heads). As a bonus, calculate the total number of \
        parameters in each GPT model.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch04::{Config, GPTModel};
        use candle_core::{DType, Device};
        use candle_nn::{VarBuilder, VarMap};

        let configs = &[
            ("gpt2-sm", Config::gpt2_124m()),
            ("gpt2-med", Config::gpt2_medium()),
            ("gpt2-l", Config::gpt2_large()),
            ("gpt2-xl", Config::gpt2_xlarge()),
        ];

        // the device can be created once and reused across model constructions
        let dev = Device::cuda_if_available(0)?;

        for (mdl_name, cfg) in configs.iter() {
            // construct the model, which registers its variables in the varmap
            let varmap = VarMap::new();
            let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
            let _ = GPTModel::new(*cfg, vb)?;

            // compute total number of params (TODO: extract into a utility fn; a sketch follows this impl)
            let mut total_params = 0_usize;
            for t in varmap.all_vars().iter() {
                total_params += t.elem_count();
            }
            println!("{} number of parameters: {}", mdl_name, total_params);

            // Get token embedding and output layer shapes
            let varmap_data = varmap.data().lock().unwrap();
            let tok_emb_dims = varmap_data.get("tok_emb.weight").unwrap().dims();
            println!("Token embedding layer shape {:?}", tok_emb_dims);
            let out_head_dims = varmap_data.get("out_head.weight").unwrap().dims();
            println!("Output layer shape {:?}", out_head_dims);

            // total number of params under weight tying (the out_head weight is shared with tok_emb, so it isn't counted twice)
            let total_params_gpt2 = total_params - (out_head_dims[0] * out_head_dims[1]);
            println!(
                "Number of trainable parameters considering weight tying {}",
                total_params_gpt2
            );

            // memory requirements, assuming 4 bytes per f32 parameter (TODO: build this out as a util; see sketch after this impl)
            let total_size_bytes = total_params * 4;
            let total_size_mb = total_size_bytes as f32 / (1024_f32 * 1024.);
            println!("Total size of the model: {} MB\n", total_size_mb);
        }
        Ok(())
    }
}
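
/// Minimal sketches of the utility fns flagged as TODOs in `X2::main` above.
/// These are hypothetical helpers, not yet wired into the exercises: one
/// counts every parameter registered in a `VarMap`, the other converts a
/// parameter count to megabytes assuming 4 bytes per `f32` value.
#[allow(dead_code)]
fn count_params(varmap: &candle_nn::VarMap) -> usize {
    varmap.all_vars().iter().map(|t| t.elem_count()).sum()
}

#[allow(dead_code)]
fn params_size_mb(num_params: usize) -> f32 {
    (num_params * 4) as f32 / (1024_f32 * 1024_f32)
}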

/// # Using separate dropout parameters
///
/// #### Id
/// 4.3
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 4.3
///
/// # with cuda
/// cargo run --features cuda exercise 4.3
/// ```
pub struct X3;

impl Exercise for X3 {
    fn name(&self) -> String {
        String::from("4.3")
    }

    fn title(&self) -> String {
        "Using separate dropout parameters".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "At the beginning of this chapter, we defined a global \
        `drop_rate` setting in the `GPT_CONFIG_124M` dictionary to set the \
        dropout rate in various places throughout the GPTModel architecture. \
        Change the code to specify a separate dropout value for the various \
        dropout layers throughout the model architecture. (Hint: there are three \
        distinct places where we used dropout layers: the embedding layer, \
        shortcut layer, and multi-head attention module.)";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch04::GPTModel;
        use candle_core::{DType, Device, IndexOp, ModuleT, Tensor};
        use candle_nn::{VarBuilder, VarMap};

        // create model
        let dev = Device::cuda_if_available(0)?;
        let varmap = VarMap::new();
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let model = GPTModel::new_v2(addons::ConfigV2::gpt_config_124m(), vb)?;
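        // Note: `gpt_config_124m()` sets all three dropout rates to 0.1. To
        // exercise truly distinct values, a config could be built by hand
        // (hypothetical rates shown):
        //
        // let cfg = addons::ConfigV2 {
        //     drop_rate_attn: 0.1,
        //     drop_rate_emb: 0.2,
        //     drop_rate_shortcut: 0.3,
        //     ..addons::ConfigV2::gpt_config_124m()
        // };
        // let model = GPTModel::new_v2(cfg, vb)?;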

        // create batch inputs
        let batch = Tensor::new(&[[101_u32, 366, 100, 345], [101, 110, 322, 57]], &dev)?;

        // run model forward
        let logits = model.forward_t(&batch, false)?;

        // print the first 10 vocabulary logits for the last token of each batch input
        let (_b, c, _vocab_size) = logits.dims3()?;
        let last_tokens_logits = logits.i((.., c - 1, ..))?;
        println!(
            "first 10 logits of last vector: {:?}",
            last_tokens_logits.i((.., 0..10))?.to_vec2::<f32>()
        );
        Ok(())
    }
}

pub mod addons {
    //! Auxiliary module for exercises::ch04
    use crate::{
        candle_addons::seqt,
        listings::{
            ch03::MultiHeadAttention,
            ch04::{FeedForward, GPTModel, LayerNorm, TransformerBlock, GELU},
        },
    };
    use candle_core::Result;
    use candle_nn::{embedding, linear_b, seq, Dropout, VarBuilder};

    /// A second `Config` variation for Exercise 4.3 that specifies individual dropout rates
    #[derive(Debug, Clone, Copy)]
    pub struct ConfigV2 {
        pub vocab_size: usize,
        pub context_length: usize,
        pub emb_dim: usize,
        pub n_heads: usize,
        pub n_layers: usize,
        pub drop_rate_attn: f32,
        pub drop_rate_emb: f32,
        pub drop_rate_shortcut: f32,
        pub qkv_bias: bool,
    }

    impl ConfigV2 {
        pub fn gpt_config_124m() -> Self {
            Self {
                vocab_size: 50_257,
                context_length: 1_024,
                emb_dim: 768,
                n_heads: 12,
                n_layers: 12,
                drop_rate_attn: 0.1,
                drop_rate_emb: 0.1,
                drop_rate_shortcut: 0.1,
                qkv_bias: false,
            }
        }
    }

    /// New `FeedForward` constructor using `ConfigV2`
    impl FeedForward {
        fn new_v2(cfg: ConfigV2, vb: VarBuilder<'_>) -> Result<Self> {
            let layers = seq()
                .add(linear_b(
                    cfg.emb_dim,
                    4_usize * cfg.emb_dim,
                    true,
                    vb.pp("first_layer"),
                )?)
                .add(GELU) // custom GELU from listings::ch04; candle_nn::Activation::Gelu would also work
                .add(linear_b(
                    4_usize * cfg.emb_dim,
                    cfg.emb_dim,
                    true,
                    vb.pp("second_layer"),
                )?);
            FeedForward::from_fields(layers)
        }
    }

    /// New `TransformerBlock` constructor using `ConfigV2`
    impl TransformerBlock {
        fn new_v2(cfg: ConfigV2, vb: VarBuilder<'_>) -> Result<Self> {
            let att = MultiHeadAttention::new(
                cfg.emb_dim,
                cfg.emb_dim,
                cfg.drop_rate_attn,
                cfg.n_heads,
                cfg.qkv_bias,
                vb.pp("mha"),
            )?;
            let ff = FeedForward::new_v2(cfg, vb.pp("ff"))?;
            let norm1 = LayerNorm::new(cfg.emb_dim, vb.pp("norm1"))?;
            let norm2 = LayerNorm::new(cfg.emb_dim, vb.pp("norm2"))?;
            let drop_shortcut = Dropout::new(cfg.drop_rate_shortcut);
            TransformerBlock::from_fields(att, ff, norm1, norm2, drop_shortcut)
        }
    }

    /// New `GPTModel` constructor using `ConfigV2`
    impl GPTModel {
        pub fn new_v2(cfg: ConfigV2, vb: VarBuilder<'_>) -> Result<Self> {
            let tok_emb = embedding(cfg.vocab_size, cfg.emb_dim, vb.pp("tok_emb"))?;
            let pos_emb = embedding(cfg.context_length, cfg.emb_dim, vb.pp("pos_emb"))?;
            let drop_emb = Dropout::new(cfg.drop_rate_emb);
            let mut trf_blocks = seqt();
            for ix in 0..cfg.n_layers {
                trf_blocks =
                    trf_blocks.add(TransformerBlock::new_v2(cfg, vb.pp(format!("trf-{}", ix)))?);
            }
            let final_norm = LayerNorm::new(cfg.emb_dim, vb.pp("final_norm"))?;
            let out_head = linear_b(cfg.emb_dim, cfg.vocab_size, false, vb.pp("out_head"))?;
            GPTModel::from_fields(tok_emb, pos_emb, drop_emb, trf_blocks, final_norm, out_head)
        }
    }
}