llms_from_scratch_rs/exercises/

//! Exercises from Chapter 3

use crate::Exercise;
use anyhow::Result;

/// # Comparing `SelfAttention_v1` and `SelfAttention_v2`
///
/// #### Id
/// 3.1
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 3.1
///
/// # with cuda
/// cargo run --features cuda exercise 3.1
/// ```
pub struct X1;

impl Exercise for X1 {
    fn name(&self) -> String {
        String::from("3.1")
    }

    fn title(&self) -> String {
        "Comparing `SelfAttention_v1` and `SelfAttention_v2`".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Note that `nn.Linear` in `SelfAttention_v2` uses a \
        different weight initialization scheme than the \
        `nn.Parameter(torch.rand(d_in, d_out))` used in `SelfAttention_v1`, \
        which causes the two mechanisms to produce different results. To check \
        that both implementations, `SelfAttention_v1` and `SelfAttention_v2`, \
        are otherwise similar, we can transfer the weight matrices from a \
        `SelfAttention_v2` object to a `SelfAttention_v1` object, such that \
        both objects then produce the same results. Your task is to correctly \
        assign the weights from an instance of `SelfAttention_v2` to an \
        instance of `SelfAttention_v1`. To do this, you need to understand the \
        relationship between the weights in both versions. (Hint: `nn.Linear` \
        stores the weight matrix in a transposed form.) After the assignment, \
        you should observe that both instances produce the same outputs.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch03::{SelfAttentionV1, SelfAttentionV2};
        use candle_core::{DType, Device, Module, Tensor};
        use candle_nn::{VarBuilder, VarMap};

        let (d_in, d_out) = (3_usize, 5_usize);
        let varmap = VarMap::new();
        let dev = Device::cuda_if_available(0)?;
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let attn_v2_layer = SelfAttentionV2::new(d_in, d_out, false, vb.pp("attn_v2"))?;
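
        // `candle_nn::Linear` (like PyTorch's `nn.Linear`) stores its weight as
        // a `(d_out, d_in)` matrix, whereas `SelfAttentionV1` holds raw
        // `(d_in, d_out)` parameters, so each weight is transposed with `.t()`.
        // The scaling factor 1/sqrt(d_out) is recovered from dim 0 of the
        // stored `(d_out, d_in)` weight.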
        let attn_v1_layer = SelfAttentionV1 {
            w_query: attn_v2_layer.w_query().weight().t()?,
            w_key: attn_v2_layer.w_key().weight().t()?,
            w_value: attn_v2_layer.w_value().weight().t()?,
            scaling: 1. / (attn_v2_layer.w_key().weight().dims()[0] as f64).sqrt(),
        };
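
        // in PyTorch, an equivalent transfer would look roughly like
        //   sa_v1.W_query = torch.nn.Parameter(sa_v2.W_query.weight.T)
        // (hypothetical instance names), and likewise for W_key and W_value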

        let input_length = 10_usize;
        let xs = Tensor::rand(0f32, 1f32, (input_length, d_in), &dev)?;
        let context_vectors_from_v1 = attn_v1_layer.forward(&xs)?;
        let context_vectors_from_v2 = attn_v2_layer.forward(&xs)?;

        println!(
            "Context vectors from SelfAttention V1 and V2 are equal when using same weights: {}",
            context_vectors_from_v1.to_vec2::<f32>()?
                == context_vectors_from_v2.to_vec2::<f32>()?
        );
        Ok(())
    }
}

/// # Returning two-dimensional embedding vectors
///
/// #### Id
/// 3.2
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 3.2
///
/// # with cuda
/// cargo run --features cuda exercise 3.2
/// ```
pub struct X2;

impl Exercise for X2 {
    fn name(&self) -> String {
        String::from("3.2")
    }

    fn title(&self) -> String {
        "Returning two-dimensional embedding vectors".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Change the input arguments for the \
        `MultiHeadAttentionWrapper(..., num_heads=2)` call such that the output \
        context vectors are two-dimensional instead of four-dimensional while \
        keeping the setting `num_heads=2`. Hint: You don't have to modify the \
        class implementation; you just have to change one of the other input \
        arguments.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch03::MultiHeadAttentionWrapper;
        use candle_core::{DType, Device, Module, Tensor};
        use candle_nn::{VarBuilder, VarMap};

        // the wrapper concatenates per-head outputs, so the final context dim
        // is num_heads * d_out; setting d_out to 1 with num_heads=2 gives 2
        let (d_in, d_out) = (3_usize, 1_usize);
        let varmap = VarMap::new();
        let dev = Device::cuda_if_available(0)?;
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let num_heads = 2_usize;
        let mha =
            MultiHeadAttentionWrapper::new(num_heads, d_in, d_out, 0.0_f32, false, vb.pp("mha"))?;

        // create random input batch
        let input_length = 6_usize;
        let xs = Tensor::rand(0f32, 1f32, (input_length, d_in), vb.device())?;
        let batch = Tensor::stack(&[&xs, &xs], 0)?;
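        // stacking two (input_length, d_in) = (6, 3) tensors along a new
        // leading axis yields a batch of shape (2, 6, 3)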
        println!("batch shape: {:?}", batch);

        // run forward on mha
        let context_vectors = mha.forward(&batch)?;
        println!("context_vectors.shape: {:?}", context_vectors);
        println!("context_vectors: {:?}", context_vectors.to_vec3::<f32>());
        Ok(())
    }
}

/// # Initializing GPT-2 size attention modules
///
/// #### Id
/// 3.3
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 3.3
///
/// # with cuda
/// cargo run --features cuda exercise 3.3
/// ```
pub struct X3;

impl Exercise for X3 {
    fn name(&self) -> String {
        String::from("3.3")
    }

    fn title(&self) -> String {
        "Initializing GPT-2 size attention modules".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Using the `MultiHeadAttention` class, initialize a \
        multi-head attention module that has the same number of attention heads \
        as the smallest GPT-2 model (12 attention heads). Also ensure that you \
        use input and output embedding sizes matching GPT-2 (768 dimensions). \
        Note that the smallest GPT-2 model supports a context length of 1,024 \
        tokens.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch03::MultiHeadAttention;
        use candle_core::{DType, Device};
        use candle_nn::{VarBuilder, VarMap};

        // GPT-2 small: 768-dimensional embeddings split across 12 heads
        let (d_in, d_out, num_heads) = (768_usize, 768_usize, 12_usize);
        let varmap = VarMap::new();
        let dev = Device::cuda_if_available(0)?;
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let mha = MultiHeadAttention::new(d_in, d_out, 0.0_f32, num_heads, false, vb.pp("mha"))?;
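
        // each of the 12 heads attends in a d_out / num_heads = 768 / 12 = 64
        // dimensional subspace; the combined query projection stays (768, 768)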

        println!("mha.num_heads: {:?}", mha.num_heads());
        println!("mha.head_dim: {:?}", mha.head_dim());
        println!("mha.w_query.shape: {:?}", mha.w_query().weight().dims());
        Ok(())
    }
}