//! Exercises from Chapter 3

use crate::Exercise;
use anyhow::Result;

/// # Comparing `SelfAttention_v1` and `SelfAttention_v2`
///
/// #### Id
/// 3.1
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 3.1
///
/// # with cuda
/// cargo run --features cuda exercise 3.1
/// ```
pub struct X1;

impl Exercise for X1 {
    fn name(&self) -> String {
        String::from("3.1")
    }

    fn title(&self) -> String {
        "Comparing `SelfAttention_v1` and `SelfAttention_v2`".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Note that `nn.Linear` in `SelfAttention_v2` uses a \
        different weight initialization scheme than the `nn.Parameter(torch.rand(d_in, d_out))` \
        used in `SelfAttention_v1`, which causes both mechanisms to produce \
        different results. To check that both implementations, `SelfAttention_v1` \
        and `SelfAttention_v2`, are otherwise similar, we can transfer the \
        weight matrices from a `SelfAttention_v2` object to a `SelfAttention_v1`, \
        such that both objects then produce the same results. Your task is to \
        correctly assign the weights from an instance of `SelfAttention_v2` to \
        an instance of `SelfAttention_v1`. To do this, you need to understand \
        the relationship between the weights in both versions. (Hint: `nn.Linear` \
        stores the weight matrix in a transposed form.) After the assignment, \
        you should observe that both instances produce the same outputs.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch03::{SelfAttentionV1, SelfAttentionV2};
        use candle_core::{DType, Device, Module, Tensor};
        use candle_nn::{VarBuilder, VarMap};

        let (d_in, d_out) = (3_usize, 5_usize);
        let varmap = VarMap::new();
        let dev = Device::cuda_if_available(0)?;
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let attn_v2_layer = SelfAttentionV2::new(d_in, d_out, false, vb.pp("attn_v2"))?;
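        // `candle_nn::Linear` (like PyTorch's `nn.Linear`) stores its weight matrix as
        // (d_out, d_in), while `SelfAttentionV1` expects (d_in, d_out) parameter matrices,
        // so each V2 weight is transposed before being assigned to V1. The scaling factor
        // 1 / sqrt(d_out) is likewise read off the first dim of the (d_out, d_in) weight.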
        let attn_v1_layer = SelfAttentionV1 {
            w_query: attn_v2_layer.w_query().weight().t()?,
            w_key: attn_v2_layer.w_key().weight().t()?,
            w_value: attn_v2_layer.w_value().weight().t()?,
            scaling: 1. / (attn_v2_layer.w_key().weight().dims()[0] as f64).sqrt(),
        };

        let input_length = 10_usize;
        let xs = Tensor::rand(0f32, 1f32, (input_length, d_in), &dev)?;
        let context_vectors_from_v1 = attn_v1_layer.forward(&xs)?;
        let context_vectors_from_v2 = attn_v2_layer.forward(&xs)?;

        println!(
            "Context vectors from SelfAttention V1 and V2 are equal when using same weights: {}",
            context_vectors_from_v1.to_vec2::<f32>()?
                == context_vectors_from_v2.to_vec2::<f32>()?
        );
        Ok(())
    }
}

/// # Returning two-dimensional embedding vectors
///
/// #### Id
/// 3.2
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 3.2
///
/// # with cuda
/// cargo run --features cuda exercise 3.2
/// ```
pub struct X2;

impl Exercise for X2 {
    fn name(&self) -> String {
        String::from("3.2")
    }

    fn title(&self) -> String {
        "Returning two-dimensional embedding vectors".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Change the input arguments for the \
        `MultiHeadAttentionWrapper(..., num_heads=2)` call such that the output \
        context vectors are two-dimensional instead of four dimensional while \
        keeping the setting `num_heads=2`. Hint: You don’t have to modify the \
        class implementation; you just have to change one of the other input arguments.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch03::MultiHeadAttentionWrapper;
        use candle_core::{DType, Device, Module, Tensor};
        use candle_nn::{VarBuilder, VarMap};

        let (d_in, d_out) = (3_usize, 1_usize); // keep num_heads = 2 but shrink d_out to 1
        let varmap = VarMap::new();
        let dev = Device::cuda_if_available(0)?;
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let num_heads = 2_usize;
        let mha =
            MultiHeadAttentionWrapper::new(num_heads, d_in, d_out, 0.0_f32, false, vb.pp("mha"))?;
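        // The wrapper concatenates each head's output along the last dimension, so the
        // final context vectors have num_heads * d_out = 2 * 1 = 2 dimensions.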

        // create random input batch
        let input_length = 6_usize;
        let xs = Tensor::rand(0f32, 1f32, (input_length, d_in), vb.device())?;
        let batch = Tensor::stack(&[&xs, &xs], 0)?;
        println!("batch shape: {:?}", batch);

        // run forward on mha
        let context_vectors = mha.forward(&batch)?;
        println!("context_vectors.shape: {:?}", context_vectors);
        println!("context_vectors: {:?}", context_vectors.to_vec3::<f32>());
        Ok(())
    }
}

/// # Initializing GPT-2 size attention modules
///
/// #### Id
/// 3.3
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 3.3
///
/// # with cuda
/// cargo run --features cuda exercise 3.3
/// ```
pub struct X3;

impl Exercise for X3 {
    fn name(&self) -> String {
        String::from("3.3")
    }

    fn title(&self) -> String {
        "Initializing GPT-2 size attention modules".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "Using the `MultiHeadAttention` class, initialize a \
        multi-head attention module that has the same number of attention heads \
        as the smallest GPT-2 model (12 attention heads). Also ensure that you \
        use the respective input and output embedding sizes similar to GPT-2 \
        (768 dimensions). Note that the smallest GPT-2 model supports a context \
        length of 1,024 tokens.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch03::MultiHeadAttention;
        use candle_core::{DType, Device};
        use candle_nn::{VarBuilder, VarMap};

        let (d_in, d_out, num_heads) = (768_usize, 768_usize, 12_usize); // GPT-2 small: 768-dim embeddings, 12 heads
        let varmap = VarMap::new();
        let dev = Device::cuda_if_available(0)?;
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
        let mha = MultiHeadAttention::new(d_in, d_out, 0.0_f32, num_heads, false, vb.pp("mha"))?;
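        // Each head works on d_out / num_heads = 768 / 12 = 64 dimensions, matching GPT-2.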

        println!("mha.num_heads: {:?}", mha.num_heads());
        println!("mha.head_dim: {:?}", mha.head_dim());
        println!("mha.w_query.shape: {:?}", mha.w_query().weight().dims());
        Ok(())
    }
}