llms_from_scratch_rs/examples/
ch04.rs

1//!  Examples from Chapter 4
2
3use crate::Example;
4use anyhow::Result;
5
6/// # Getting logits with `DummyGPTModel`
7///
8/// #### Id
9/// 04.01
10///
11/// #### Page
12/// This example starts on page 97
13///
14/// #### CLI command
15/// ```sh
16/// # without cuda
17/// cargo run example 04.01
18///
19/// # with cuda
20/// cargo run --features cuda example 04.01
21/// ```
22pub struct EG01;
23
24impl Example for EG01 {
25    fn description(&self) -> String {
26        String::from("Getting logits with `DummyGPTModel`.")
27    }
28
29    fn page_source(&self) -> usize {
30        97_usize
31    }
32
33    fn main(&self) -> Result<()> {
34        use crate::listings::ch04::{Config, DummyGPTModel};
35        use candle_core::{DType, IndexOp, Module};
36        use candle_nn::{VarBuilder, VarMap};
37
38        let batch = addons::get_batch_for_gpts()?;
39        println!("batch: {:?}", batch.to_vec2::<u32>());
40
41        // create model
42        let varmap = VarMap::new();
43        let vb = VarBuilder::from_varmap(&varmap, DType::F32, batch.device());
44        let model = DummyGPTModel::new(Config::gpt2_124m(), vb)?;
45
46        // get logits
47        let logits = model.forward(&batch)?;
48        println!("output shape: {:?}", logits.shape());
49
50        // print first 10 next-token logits for each token of every input sequence
51        println!("logits: {:?}", logits.i((.., .., 0..10))?.to_vec3::<f32>());
52        Ok(())
53    }
54}
55
56/// # Manual computation of layer normalization
57///
58/// #### Id
59/// 04.02
60///
61/// #### Page
62/// This example starts on page 100
63///
64/// #### CLI command
65/// ```sh
66/// # without cuda
67/// cargo run example 04.02
68///
69/// # with cuda
70/// cargo run --features cuda example 04.02
71/// ```
72pub struct EG02;
73
74impl Example for EG02 {
75    fn description(&self) -> String {
76        String::from("Manual computation of layer normalization.")
77    }
78
79    fn page_source(&self) -> usize {
80        100_usize
81    }
82
83    fn main(&self) -> Result<()> {
84        use candle_core::{DType, Device, Module, Tensor, D};
85        use candle_nn::{linear_b, seq, Activation, VarBuilder, VarMap};
86
87        let dev = Device::cuda_if_available(0)?;
88        let varmap = VarMap::new();
89        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
90
91        // create batch
92        let batch_example = Tensor::rand(0f32, 1f32, (2_usize, 5_usize), vb.device())?;
93
94        // create layer
95        let layer = seq()
96            .add(linear_b(5_usize, 6_usize, false, vb.pp("linear"))?)
97            .add(Activation::Relu);
98
99        // execute layer on batch
100        let out = layer.forward(&batch_example)?;
101        println!("out: {:?}", out.to_vec2::<f32>());
102
103        // calculate stats on outputs
104        let mean = out.mean_keepdim(D::Minus1)?;
105        let var = out.var_keepdim(D::Minus1)?;
106        println!("mean: {:?}", mean.to_vec2::<f32>());
107        println!("variance: {:?}", var.to_vec2::<f32>());
108
109        // layer normalization
110        let out_norm = (out.broadcast_sub(&mean)?.broadcast_div(&var.sqrt()?))?;
111        let mean = out_norm.mean_keepdim(D::Minus1)?;
112        let var = out_norm.var_keepdim(D::Minus1)?;
113        println!("normalized out: {:?}", out_norm.to_vec2::<f32>());
114        println!("mean: {:?}", mean.to_vec2::<f32>());
115        println!("variance: {:?}", var.to_vec2::<f32>());
116        Ok(())
117    }
118}
119
120/// # Example usage of `LayerNorm`
121///
122/// #### Id
123/// 04.03
124///
125/// #### Page
126/// This example starts on page 104
127///
128/// #### CLI command
129/// ```sh
130/// # without cuda
131/// cargo run example 04.03
132///
133/// # with cuda
134/// cargo run --features cuda example 04.03
135/// ```
136pub struct EG03;
137
138impl Example for EG03 {
139    fn description(&self) -> String {
140        String::from("Example usage of `LayerNorm`.")
141    }
142
143    fn page_source(&self) -> usize {
144        104_usize
145    }
146
147    fn main(&self) -> Result<()> {
148        use crate::listings::ch04::LayerNorm;
149        use candle_core::{DType, Device, Module, Tensor, D};
150        use candle_nn::{VarBuilder, VarMap};
151
152        let dev = Device::cuda_if_available(0)?;
153        let varmap = VarMap::new();
154        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
155
156        // create batch
157        let batch_example = Tensor::rand(0f32, 1f32, (2_usize, 5_usize), vb.device())?;
158
159        // construct layer norm layer
160        let emb_dim = 5_usize;
161        let ln = LayerNorm::new(emb_dim, vb.pp("layer_norm"))?;
162        let out_ln = ln.forward(&batch_example)?;
163
164        // compute stats on out_ln
165        let mean = out_ln.mean_keepdim(D::Minus1)?;
166        let var = out_ln.var_keepdim(D::Minus1)?;
167        println!("mean: {:?}", mean.to_vec2::<f32>());
168        println!("variance: {:?}", var.to_vec2::<f32>());
169        Ok(())
170    }
171}
172
173/// # Example usage of `FeedForward` Module.
174///
175/// #### Id
176/// 04.04
177///
178/// #### Page
179/// This example starts on page 108
180///
181/// #### CLI command
182/// ```sh
183/// # without cuda
184/// cargo run example 04.04
185///
186/// # with cuda
187/// cargo run --features cuda example 04.04
188/// ```
189pub struct EG04;
190
191impl Example for EG04 {
192    fn description(&self) -> String {
193        String::from("Example usage of `FeedForward` Module.")
194    }
195
196    fn page_source(&self) -> usize {
197        108_usize
198    }
199
200    fn main(&self) -> Result<()> {
201        use crate::listings::ch04::{Config, FeedForward};
202        use candle_core::{DType, Device, IndexOp, Module, Tensor};
203        use candle_nn::{VarBuilder, VarMap};
204
205        let dev = Device::cuda_if_available(0)?;
206        let varmap = VarMap::new();
207        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
208        let cfg = Config::gpt2_124m();
209
210        // create batch
211        let (batch_size, seq_len) = (2_usize, 3_usize);
212        let x = Tensor::rand(0f32, 1f32, (batch_size, seq_len, cfg.emb_dim), vb.device())?;
213
214        // feedforward
215        let ffn = FeedForward::new(cfg, vb.pp("ffn"))?;
216        let out = ffn.forward(&x)?;
217
218        println!("{:?}", out);
219        // first 10 hidden states of the embedding for 1st sequence, 1st token
220        println!("{:?}", out.i((0, 0, 0..10))?.to_vec1::<f32>());
221        Ok(())
222    }
223}
224
225/// # Comparison of gradients with and without shortcut connections
226///
227/// #### Id
228/// 04.05
229///
230/// #### Page
231/// This example starts on page 111
232///
233/// #### CLI command
234/// ```sh
235/// # without cuda
236/// cargo run example 04.05
237///
238/// # with cuda
239/// cargo run --features cuda example 04.05
240/// ```
241pub struct EG05;
242
243impl Example for EG05 {
244    fn description(&self) -> String {
245        String::from("Comparison of gradients with and without shortcut connections.")
246    }
247
248    fn page_source(&self) -> usize {
249        111_usize
250    }
251
252    fn main(&self) -> Result<()> {
253        use crate::listings::ch04::ExampleDeepNeuralNetwork;
254        use candle_core::{DType, Device, Tensor};
255        use candle_nn::{VarBuilder, VarMap};
256
257        let dev = Device::cuda_if_available(0)?;
258        let varmap = VarMap::new();
259        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
260
261        let layer_sizes = &[3_usize, 3, 3, 3, 3, 1];
262        let sample_input = Tensor::new(&[[1_f32, 0., -1.]], vb.device())?;
263        let model_without_shortcut =
264            ExampleDeepNeuralNetwork::new(layer_sizes, false, vb.pp("model_wout_shortcut"))?;
265
266        let model_with_shortcut =
267            ExampleDeepNeuralNetwork::new(layer_sizes, true, vb.pp("model_with_shortcut"))?;
268
269        println!("model_without_shortcut gradients:");
270        addons::print_gradients(model_without_shortcut, &sample_input)?;
271        println!("model_with_shortcut gradients:");
272        addons::print_gradients(model_with_shortcut, &sample_input)?;
273        Ok(())
274    }
275}
276
277/// # Example usage of `TransformerBlock`
278///
279/// #### Id
280/// 04.06
281///
282/// #### Page
283/// This example starts on page 116
284///
285/// #### CLI command
286/// ```sh
287/// # without cuda
288/// cargo run example 04.06
289///
290/// # with cuda
291/// cargo run --features cuda example 04.06
292/// ```
293pub struct EG06;
294
295impl Example for EG06 {
296    fn description(&self) -> String {
297        String::from("Example usage of `TransformerBlock`.")
298    }
299
300    fn page_source(&self) -> usize {
301        116_usize
302    }
303
304    fn main(&self) -> Result<()> {
305        use crate::listings::ch04::{Config, TransformerBlock};
306        use candle_core::{DType, Device, IndexOp, Tensor};
307        use candle_nn::{VarBuilder, VarMap};
308
309        // construct transformer block
310        let dev = Device::cuda_if_available(0)?;
311        let varmap = VarMap::new();
312        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
313        let cfg = Config::gpt2_124m();
314        let block = TransformerBlock::new(cfg, vb.pp("block"))?;
315
316        // create sample input
317        let (batch_size, num_tokens) = (2_usize, 4_usize);
318        let x = Tensor::rand(
319            0f32,
320            1f32,
321            (batch_size, num_tokens, cfg.emb_dim),
322            vb.device(),
323        )?;
324
325        // execute forward pass
326        let output = block.forward(&x)?;
327
328        println!("Input shape: {:?}", x.shape());
329        println!("Output shape: {:?}", output.shape());
330
331        // print the first 10 features of all tokens of the first input
332        println!(
333            "Output: {:?}",
334            output.i((0..1, .., 0..10))?.to_vec3::<f32>()
335        );
336        Ok(())
337    }
338}
339
340/// # Example usage of `GPTModel`
341///
342/// #### Id
343/// 04.07
344///
345/// #### Page
346/// This example starts on page 120
347///
348/// #### CLI command
349/// ```sh
350/// # without cuda
351/// cargo run example 04.07
352///
353/// # with cuda
354/// cargo run --features cuda example 04.07
355/// ```
356pub struct EG07;
357
358impl Example for EG07 {
359    fn description(&self) -> String {
360        String::from("Example usage of `GPTModel`.")
361    }
362
363    fn page_source(&self) -> usize {
364        120_usize
365    }
366
367    fn main(&self) -> Result<()> {
368        use crate::listings::ch04::{Config, GPTModel};
369        use candle_core::{DType, Error, IndexOp, ModuleT};
370        use candle_nn::{VarBuilder, VarMap};
371
372        let batch = addons::get_batch_for_gpts()?;
373        println!("batch: {:?}", batch.to_vec2::<u32>());
374
375        // create model
376        let varmap = VarMap::new();
377        let vb = VarBuilder::from_varmap(&varmap, DType::F32, batch.device());
378        let model = GPTModel::new(Config::gpt2_124m(), vb)?;
379
380        // get logits
381        let logits = model.forward_t(&batch, false)?;
382        println!("output shape: {:?}", logits.shape());
383
384        // print first 10 next-token logits for each token of every input sequence
385        println!("logits: {:?}", logits.i((.., .., 0..10))?.to_vec3::<f32>());
386
387        // get total number of params from the VarMap (todo: turn this into a util)
388        let mut total_params = 0_usize;
389        for t in varmap.all_vars().iter() {
390            total_params += t.elem_count();
391        }
392        println!("Total number of parameters: {}", total_params);
393
394        // Get token embedding and output layer shapes
395        let varmap_binding = varmap.data().lock().unwrap();
396        let tok_emb_dims = varmap_binding
397            .get("tok_emb.weight")
398            .ok_or_else(|| {
399                Error::CannotFindTensor {
400                    path: "tok_emb.weight".to_string(),
401                }
402                .bt()
403            })?
404            .dims();
405        println!("Token embedding layer shape {:?}", tok_emb_dims);
406        let out_head_dims = varmap_binding
407            .get("out_head.weight")
408            .ok_or_else(|| {
409                Error::CannotFindTensor {
410                    path: "out_head.weight".to_string(),
411                }
412                .bt()
413            })?
414            .dims();
415        println!("Output layer shape {:?}", out_head_dims);
416
417        // total number of params if weight tying with token emb and output layer shapes
418        let total_params_gpt2 = total_params - (out_head_dims[0] * out_head_dims[1]);
419        println!(
420            "Number of trainable parameters considering weight tying {}",
421            total_params_gpt2
422        );
423
424        // memory requirements
425        let total_size_bytes = total_params * 4;
426        let total_size_mb = total_size_bytes as f32 / (1024_f32 * 1024.);
427        println!("Total size of the model: {} MB", total_size_mb);
428        Ok(())
429    }
430}
431
432/// # Example usage of `generate_text_simple`
433///
434/// #### Id
435/// 04.08
436///
437/// #### Page
438/// This example starts on page 125
439///
440/// #### CLI command
441/// ```sh
442/// # without cuda
443/// cargo run example 04.08
444///
445/// # with cuda
446/// cargo run --features cuda example 04.08
447/// ```
448pub struct EG08;
449
450impl Example for EG08 {
451    fn description(&self) -> String {
452        String::from("Example usage of `generate_text_simple`.")
453    }
454
455    fn page_source(&self) -> usize {
456        125_usize
457    }
458
459    fn main(&self) -> Result<()> {
460        use crate::listings::ch04::{generate_text_simple, Config, GPTModel};
461        use candle_core::{DType, Device, Tensor};
462        use candle_nn::{VarBuilder, VarMap};
463        use tiktoken_rs::get_bpe_from_model;
464
465        // get starting context
466        let dev = Device::cuda_if_available(0)?;
467        let start_context = "Hello, I am";
468        let tokenizer = get_bpe_from_model("gpt2")?;
469        let encoded = tokenizer.encode_with_special_tokens(start_context);
470        let num_tokens = encoded.len();
471        println!("encoded: {:?}", encoded);
472        let encoded_tensor = Tensor::from_vec(encoded, (1_usize, num_tokens), &dev)?;
473        println!("encoded_tensor.shape {:?}", encoded_tensor);
474
475        // construct model
476        let varmap = VarMap::new();
477        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
478        let cfg = Config::gpt2_124m();
479        let model = GPTModel::new(cfg, vb)?;
480
481        // run inference
482        let out = generate_text_simple(&model, encoded_tensor, 6_usize, cfg.context_length)?;
483        println!("Output: {:?}", out.to_vec2::<u32>());
484        println!("Output length: {}", out.dims()[1]);
485
486        // decode with tokenizer
487        let decoded_text = tokenizer.decode(out.reshape(out.dims()[1])?.to_vec1::<u32>()?);
488        println!("{:?}", decoded_text);
489        Ok(())
490    }
491}
492
493pub mod addons {
494    //! Auxiliary module for examples::ch04
495    use crate::listings::ch04::ExampleDeepNeuralNetwork;
496    use candle_core::{Device, Error, Module, Result, Tensor};
497    use tiktoken_rs::get_bpe_from_model;
498
499    /// Helper function to a sample batch of tokens to feed into GPTs.
500    pub fn get_batch_for_gpts() -> Result<Tensor> {
501        let dev = Device::cuda_if_available(0)?;
502
503        // create batch
504        let mut batch_tokens: Vec<u32> = Vec::new();
505        let tokenizer =
506            get_bpe_from_model("gpt2").map_err(|e| Error::Msg(format!("Tokenizer error: {e}")))?;
507        batch_tokens.append(&mut tokenizer.encode_with_special_tokens("Every effort moves you"));
508        batch_tokens.append(&mut tokenizer.encode_with_special_tokens("Every day holds a"));
509
510        Tensor::from_vec(batch_tokens, (2_usize, 4_usize), &dev)
511    }
512
513    /// Helper function for printing gradients of `ExampleDeepNeuralNetwork`
514    pub fn print_gradients(model: ExampleDeepNeuralNetwork, x: &Tensor) -> Result<()> {
515        use candle_nn::loss::mse;
516
517        let output = model.forward(x)?;
518        let target = Tensor::new(&[[0_f32]], x.device())?;
519
520        let loss = mse(&output, &target)?;
521        let grads = loss.backward()?;
522
523        for (ix, tensor_id) in model.tensor_ids.iter().enumerate() {
524            let grad_tensor = grads.get_id(tensor_id.to_owned()).ok_or_else(|| {
525                Error::CannotFindTensor {
526                    path: format!("{:?}", tensor_id),
527                }
528                .bt()
529            })?;
530            println!(
531                "layer.{}.weight has gradient mean of {:?}",
532                ix,
533                grad_tensor.abs()?.mean_all()?.to_scalar::<f32>()?
534            );
535        }
536        println!("\n");
537        Ok(())
538    }
539}