// llms_from_scratch_rs/exercises/ch02.rs

//! Exercises from Chapter 2

use crate::Exercise;
use anyhow::Result;

/// # Byte pair encoding of unknown words
///
/// Solution to exercise 2.1. Its `Exercise::main` implementation encodes the
/// text `Akwirw ier` with the `gpt2` tokenizer obtained from `tiktoken_rs`,
/// prints the resulting token ids and decodes them back into text.
///
/// #### Id
/// 2.1
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 2.1
///
/// # with cuda
/// cargo run --features cuda exercise 2.1
/// ```
pub struct X1;

impl Exercise for X1 {
    /// Returns the exercise id, `"2.1"`.
    fn name(&self) -> String {
        String::from("2.1")
    }

    /// Returns the short, human-readable title of the exercise.
    fn title(&self) -> String {
        "Byte pair encoding of unknown words".to_string()
    }

    /// Returns the full exercise statement.
    fn statement(&self) -> String {
        let stmt = "Try the BPE tokenizer from the tiktoken library on the \
        unknown words 'Akwirw ier' and print the individual token IDs. Then, \
        call the decode function on each of the resulting integers in this list \
        to reproduce the mapping shown in figure 2.11. Lastly, call the decode \
        method on the token IDs to check whether it can reconstruct the \
        original input, 'Akwirw ier.'";
        stmt.to_string()
    }

    /// Runs the exercise and prints its results to stdout.
    ///
    /// The three steps mirror the exercise statement:
    /// 1. encode `"Akwirw ier"` and print the token ids,
    /// 2. decode every token id on its own and print the id -> text mapping,
    /// 3. decode the whole id sequence and print the reconstructed text.
    ///
    /// # Errors
    ///
    /// Returns an error if the `gpt2` tokenizer cannot be loaded or if any of
    /// the decode calls fails.
    fn main(&self) -> Result<()> {
        use tiktoken_rs::get_bpe_from_model;

        let tokenizer = get_bpe_from_model("gpt2")?;
        let token_ids = tokenizer.encode_with_special_tokens("Akwirw ier");
        println!("token ids: {:?}", token_ids);

        // Decode each id individually so the sub-word pieces the tokenizer
        // produced for the unknown words become visible. This must happen
        // before the full decode below, which takes ownership of `token_ids`.
        for &token_id in &token_ids {
            let piece = tokenizer.decode(vec![token_id])?;
            println!("{} -> {:?}", token_id, piece);
        }

        // Round-trip check: the complete id sequence should give back the input.
        let decoded_text = tokenizer.decode(token_ids)?;
        println!("decoded text: {}", decoded_text);
        Ok(())
    }
}

/// # Data loaders with different strides and context sizes
///
/// Solution to exercise 2.2. Its `Exercise::main` implementation reads
/// `data/the-verdict.txt`, builds a data loader through
/// `create_dataloader_v1` with `batch_size = 2`, `max_length = 4` and
/// `stride = 2`, and prints the first batch of inputs and targets.
///
/// #### Id
/// 2.2
///
/// #### CLI command
/// ```sh
/// # without cuda
/// cargo run exercise 2.2
///
/// # with cuda
/// cargo run --features cuda exercise 2.2
/// ```
pub struct X2;

impl Exercise for X2 {
    fn name(&self) -> String {
        String::from("2.2")
    }

    fn title(&self) -> String {
        "Data loaders with different strides and context sizes".to_string()
    }

    fn statement(&self) -> String {
        let stmt = "To develop more intuition for how the data loader works, \
        try to run it with different settings such as `max_length=2` and \
        `stride=2`, and `max_length=8` and `stride=2`.";
        stmt.to_string()
    }

    fn main(&self) -> Result<()> {
        use crate::listings::ch02::{create_dataloader_v1, DataLoader};
        use std::fs;

        let raw_text = fs::read_to_string("data/the-verdict.txt").expect("Unable to read the file");
        let max_length = 4_usize;
        let stride = 2_usize;
        let shuffle = false;
        let drop_last = false;
        let batch_size = 2_usize;
        let data_loader = create_dataloader_v1(
            &raw_text[..],
            batch_size,
            max_length,
            stride,
            shuffle,
            drop_last,
        );

        let mut batch_iter = data_loader.batcher();
        match batch_iter.next() {
            Some(Ok((inputs, targets))) => {
                println!(
                    "inputs: {:?}\n\ntargets: {:?}",
                    inputs.to_vec2::<u32>(),
                    targets.to_vec2::<u32>()
                );
            }
            Some(Err(err)) => panic!("{}", err),
            None => panic!("None"),
        }
        Ok(())
    }
}