llms_from_scratch_rs/exercises/ch03.rs

use crate::Exercise;
use anyhow::Result;

pub struct X1;

impl Exercise for X1 {
    fn name(&self) -> String {
        String::from("3.1")
    }

    fn title(&self) -> String {
        "Comparing `SelfAttention_v1` and `SelfAttention_v2`".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Note that `nn.Linear` in `SelfAttention_v2` uses a \
        different weight initialization scheme than the \
        `nn.Parameter(torch.rand(d_in, d_out))` used in `SelfAttention_v1`, \
        which causes both mechanisms to produce different results. To check \
        that both implementations, `SelfAttention_v1` and `SelfAttention_v2`, \
        are otherwise similar, we can transfer the weight matrices from a \
        `SelfAttention_v2` object to a `SelfAttention_v1`, such that both \
        objects then produce the same results. Your task is to correctly \
        assign the weights from an instance of `SelfAttention_v2` to an \
        instance of `SelfAttention_v1`. To do this, you need to understand \
        the relationship between the weights in both versions. (Hint: \
        `nn.Linear` stores the weight matrix in a transposed form.) After \
        the assignment, you should observe that both instances produce the \
        same outputs.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch03::{SelfAttentionV1, SelfAttentionV2};
        use candle_core::{DType, Device, Module, Tensor};
        use candle_nn::{VarBuilder, VarMap};

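        // Toy dimensions for the comparison: 3-dimensional inputs projected
        // to 5-dimensional context vectors.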
        let (d_in, d_out) = (3_usize, 5_usize);
        let varmap = VarMap::new();
        let dev = Device::cuda_if_available(0)?;
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let attn_v2_layer = SelfAttentionV2::new(d_in, d_out, false, vb.pp("attn_v2"))?;
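        // `candle_nn::Linear` (like `nn.Linear`) stores its weight as a
        // (d_out, d_in) matrix, while `SelfAttentionV1` holds plain
        // (d_in, d_out) parameter matrices, so each weight is transposed
        // before it is assigned (see the hint in the exercise statement).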
        let attn_v1_layer = SelfAttentionV1 {
            w_query: attn_v2_layer.w_query().weight().t()?,
            w_key: attn_v2_layer.w_key().weight().t()?,
            w_value: attn_v2_layer.w_value().weight().t()?,
            scaling: 1. / (attn_v2_layer.w_key().weight().dims()[0] as f64).sqrt(),
        };

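        // Run the same random input through both layers; with the shared
        // weights, the two sets of context vectors should match element-wise.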
        let input_length = 10_usize;
        let xs = Tensor::rand(0f32, 1f32, (input_length, d_in), &dev)?;
        let context_vectors_from_v1 = attn_v1_layer.forward(&xs)?;
        let context_vectors_from_v2 = attn_v2_layer.forward(&xs)?;

        println!(
            "Context vectors from SelfAttention V1 and V2 are equal when using same weights: {}",
            context_vectors_from_v1.to_vec2::<f32>()?
                == context_vectors_from_v2.to_vec2::<f32>()?
        );
        Ok(())
    }
}

pub struct X2;

impl Exercise for X2 {
    fn name(&self) -> String {
        String::from("3.2")
    }

    fn title(&self) -> String {
        "Returning two-dimensional embedding vectors".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Change the input arguments for the \
        `MultiHeadAttentionWrapper(..., num_heads=2)` call such that the \
        output context vectors are two-dimensional instead of \
        four-dimensional while keeping the setting `num_heads=2`. Hint: You \
        don’t have to modify the class implementation; you just have to \
        change one of the other input arguments.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch03::MultiHeadAttentionWrapper;
        use candle_core::{DType, Device, Module, Tensor};
        use candle_nn::{VarBuilder, VarMap};

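        // To get two-dimensional context vectors while keeping
        // `num_heads = 2`, set the per-head output dimension `d_out` to 1;
        // the wrapper concatenates the head outputs (2 * 1 = 2 dimensions).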
        let (d_in, d_out) = (3_usize, 1_usize);
        let varmap = VarMap::new();
        let dev = Device::cuda_if_available(0)?;
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let num_heads = 2_usize;
        let mha =
            MultiHeadAttentionWrapper::new(num_heads, d_in, d_out, 0.0_f32, false, vb.pp("mha"))?;

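        // Stack the same (6, 3) input twice to form a batch of shape (2, 6, 3).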
        let input_length = 6_usize;
        let xs = Tensor::rand(0f32, 1f32, (input_length, d_in), vb.device())?;
        let batch = Tensor::stack(&[&xs, &xs], 0)?;
        println!("batch shape: {:?}", batch);

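        // With two heads of output dimension 1 each, the concatenated
        // context vectors are two-dimensional: the output shape should be (2, 6, 2).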
        let context_vectors = mha.forward(&batch)?;
        println!("context_vectors.shape: {:?}", context_vectors);
        println!("context_vectors: {:?}", context_vectors.to_vec3::<f32>());
        Ok(())
    }
}

pub struct X3;

impl Exercise for X3 {
    fn name(&self) -> String {
        String::from("3.3")
    }

    fn title(&self) -> String {
        "Initializing GPT-2 size attention modules".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Using the `MultiHeadAttention` class, initialize a \
        multi-head attention module that has the same number of attention \
        heads as the smallest GPT-2 model (12 attention heads). Also ensure \
        that you use the respective input and output embedding sizes similar \
        to GPT-2 (768 dimensions). Note that the smallest GPT-2 model \
        supports a context length of 1,024 tokens.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch03::MultiHeadAttention;
        use candle_core::{DType, Device};
        use candle_nn::{VarBuilder, VarMap};

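        // GPT-2 small configuration: 768-dimensional embeddings split across
        // 12 heads, i.e. each head works with 768 / 12 = 64 dimensions.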
        let (d_in, d_out, num_heads) = (768_usize, 768_usize, 12_usize);
        let varmap = VarMap::new();
        let dev = Device::cuda_if_available(0)?;
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let mha = MultiHeadAttention::new(d_in, d_out, 0.0_f32, num_heads, false, vb.pp("mha"))?;

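        // The reported values should be 12 heads, a head_dim of 64, and a
        // (768, 768) query weight matrix.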
        println!("mha.num_heads: {:?}", mha.num_heads());
        println!("mha.head_dim: {:?}", mha.head_dim());
        println!("mha.w_query.shape: {:?}", mha.w_query().weight().dims());
        Ok(())
    }
}