gpt-model 0.1.0

//! Pure Rust implementation of the GPT-2
//! byte-pair encoder (aka "text tokenizer").
use std::{collections::HashMap, fs, vec};

use fancy_regex::Regex;
use serde::Deserialize;

/// Pattern used to match encodable UTF-8 text in unencoded text.
///
/// Every match of this pattern is one fragment that is byte-pair
/// encoded on its own. The alternatives are tried in this order:
///
/// - one of the contraction suffixes `'s`, `'t`, `'re`, `'ve`, `'m`, `'ll`, `'d`;
/// - an optional leading space followed by a run of letters (`\p{L}+`);
/// - an optional leading space followed by a run of numeric characters (`\p{N}+`);
/// - an optional leading space followed by a run of characters that are
///   neither whitespace, letters nor numeric characters;
/// - a run of whitespace that is not followed by a non-whitespace
///   character (the negative look-ahead `(?!\S)`);
/// - any remaining run of whitespace.
const ENCODABLE_UTF8_PATTERN: &str =
    r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";

/// Token `50256` is used as the padding token, which
/// corresponds to the `<|endoftext|>` token in the
/// OpenAI GPT-2 encoder.
///
/// Note that this is the same value as [END_OF_TEXT_TOKEN],
/// so padding and end-of-text cannot be told apart in a
/// token sequence.
pub const PAD_TOKEN: i32 = 50256;

/// Token index that stands for the end-of-text marker
/// (see [END_OF_TEXT_STRING]).
pub const END_OF_TEXT_TOKEN: i32 = 50256;
/// Textual form of the end-of-text marker; the encoder looks
/// for it at the end of its input and emits
/// [END_OF_TEXT_TOKEN] in its place.
pub const END_OF_TEXT_STRING: &str = "<|endoftext|>";

/// Tokenizer which converts strings into
/// token sequences consumable by the
/// GPT-2 model, and vice-versa.
///
/// This tokenizer loads its configuration
/// from the original OpenAI GPT-2 encoder
/// and vocabulary "byte-pair encoding" (BPE).
///
/// All lookup tables are built once in `Tokenizer::new`
/// and are only read afterwards.
pub struct Tokenizer {
    /// Byte-pair encoding "ranks",
    /// obtained from an existing
    /// `*_vocab.bpe` file.
    ///
    /// The key is a pair of adjacent symbols and the value
    /// is the position of that pair in the file; during
    /// encoding the pair with the lowest rank is merged first.
    bpe_ranks: HashMap<(String, String), u32>,

    /// Mapping of UTF-8 bytes to tokens,
    /// obtained via computation on start-up.
    ///
    /// A "UTF-8 byte" is a literal `u8`
    /// representing a unicode-encodable byte,
    /// and a "token" is a character (like `c`).
    /// Every one of the 256 possible byte values has an entry.
    utf8_bytes_to_char_tokens: HashMap<u8, char>,

    /// Reverse of `utf8_bytes_to_char_tokens`
    /// for decoding.
    char_tokens_to_utf8_bytes: HashMap<char, u8>,

    /// Mapping of tokens to their indexes,
    /// typically obtained from an existing
    /// `*_encoder.json` file.
    ///
    /// A "token" is a character (like `c`)
    /// or a word fragment (like `ing`), and
    /// an index is a number that represents
    /// that token.
    tokens_to_indexes: HashMap<String, i32>,

    /// Reverse of `tokens_to_indexes`
    /// for decoding.
    indexes_to_tokens: HashMap<i32, String>,
}

impl Tokenizer {
    /// Creates a new in-memory tokenizer
    /// from the BPE file at `bpe_path`
    /// and the character encoding file at `encoder_path`.
    ///
    /// The first line of the BPE file is treated as a header and
    /// skipped. Every following non-blank line must contain two
    /// whitespace-separated symbols; the pair's rank is its position
    /// among those lines (starting at zero). If a pair occurs more
    /// than once, the last occurrence determines its rank.
    ///
    /// The encoder file must be a JSON object that maps token
    /// strings to integer token indexes.
    ///
    /// # Panics
    ///
    /// Panics if either file cannot be read, if a BPE line does not
    /// contain two symbols, or if the encoder file is not valid JSON
    /// of the expected shape. The panic message names the offending
    /// file.
    pub fn new(bpe_path: &str, encoder_path: &str) -> Self {
        // Build byte-pair encoding ranks, where each pair is mapped
        // to its position in the byte-pair encoding file.
        let bpe_str = fs::read_to_string(bpe_path)
            .unwrap_or_else(|err| panic!("failed to read BPE file `{}`: {}", bpe_path, err));
        let merge_lines = bpe_str
            .lines()
            .skip(1) // header line
            .filter(|line| !line.trim().is_empty());

        let mut bpe_ranks = HashMap::new();
        for (rank, line) in merge_lines.enumerate() {
            let mut symbols = line.split_whitespace();
            let pair = match (symbols.next(), symbols.next()) {
                (Some(first), Some(second)) => (first.to_string(), second.to_string()),
                _ => panic!(
                    "malformed BPE line `{}` in `{}`: expected two symbols",
                    line, bpe_path
                ),
            };
            let rank = u32::try_from(rank).unwrap_or_else(|_| {
                panic!("BPE file `{}` has too many lines to rank with u32", bpe_path)
            });
            bpe_ranks.insert(pair, rank);
        }

        // Parse encoder JSON and derive the reverse lookup from it.
        let encoder_str = fs::read_to_string(encoder_path).unwrap_or_else(|err| {
            panic!("failed to read encoder file `{}`: {}", encoder_path, err)
        });
        let encoder_json: EncoderJson = serde_json::from_str(&encoder_str).unwrap_or_else(|err| {
            panic!("failed to parse encoder file `{}`: {}", encoder_path, err)
        });
        let tokens_to_indexes = encoder_json.token_indexes;
        let indexes_to_tokens: HashMap<i32, String> = tokens_to_indexes
            .iter()
            .map(|(token, index)| (*index, token.clone()))
            .collect();

        // Compute UTF-8 byte-token maps.
        let (utf8_bytes_to_char_tokens, char_tokens_to_utf8_bytes) = create_utf8_char_maps();

        Self {
            bpe_ranks,
            utf8_bytes_to_char_tokens,
            char_tokens_to_utf8_bytes,
            tokens_to_indexes,
            indexes_to_tokens,
        }
    }

    /// Encodes `text` and forces the result to be exactly
    /// `token_sequence_length` tokens long.
    ///
    /// Sequences that are too long are cut off at the end;
    /// sequences that are too short are "right-padded" with
    /// [PAD_TOKEN].
    ///
    /// Returns `(token_sequence, padding_length)`, where
    /// `padding_length` counts the padding tokens appended to
    /// `token_sequence`. It is zero whenever the encoded text
    /// was at least `token_sequence_length` tokens long.
    ///
    /// ## Left vs. Right Padding
    ///
    /// Common wisdom in the ML community is to "pad-left" on
    /// natural language models like GPT-2, i.e. to put the
    /// padding tokens in _front_ of the input tokens until the
    /// required input length is reached.
    ///
    /// This method instead "pads-right": padding tokens go at
    /// the _end_ of the input tokens.
    ///
    /// Right-padding works because GPT-2 never looks "ahead"
    /// (to the right) of its inputs, so padding on the right
    /// cannot influence the inference results of any token to
    /// its left.
    ///
    /// Left-padding, by contrast, only works on GPT-2 together
    /// with an attention mask that tells the model which tokens
    /// (such as padding) to ignore. Attention masking is more
    /// efficient but slightly more complicated to implement,
    /// which is why this implementation does without it.
    pub fn encode_to_length(&self, text: &str, token_sequence_length: usize) -> (Vec<i32>, usize) {
        let mut token_sequence = self.encode(text);

        // Free slots left after the real tokens; saturates at zero
        // when the encoded text already fills or exceeds the target.
        let padding_length = token_sequence_length.saturating_sub(token_sequence.len());

        // `resize` handles both directions: it drops surplus tokens
        // from the end or appends padding tokens as needed.
        token_sequence.resize(token_sequence_length, PAD_TOKEN);

        (token_sequence, padding_length)
    }

    /// Encodes `text` into a token sequence for
    /// consumption by the GPT-2 model.
    ///
    /// Each [END_OF_TEXT_STRING] marker at the very end of `text`
    /// is emitted as one [END_OF_TEXT_TOKEN]; repeated trailing
    /// markers yield the same number of tokens. A marker anywhere
    /// else in `text` gets no special treatment and is encoded like
    /// ordinary text.
    ///
    /// # Panics
    ///
    /// Panics if the regex engine reports an error while matching,
    /// or if byte-pair encoding produces a token that is missing
    /// from the loaded vocabulary.
    pub fn encode(&self, text: &str) -> Vec<i32> {
        let mut token_sequence = vec![];

        // Peel end-of-text markers off the end one at a time and
        // remember how many there were, so none of them is lost.
        let mut text = text;
        let mut eot_count = 0usize;
        while let Some(stripped) = text.strip_suffix(END_OF_TEXT_STRING) {
            text = stripped;
            eot_count += 1;
        }

        // Find all encodable UTF-8 text.
        let utf8_pattern =
            Regex::new(ENCODABLE_UTF8_PATTERN).expect("ENCODABLE_UTF8_PATTERN must be valid");
        for captures in utf8_pattern.captures_iter(text) {
            let captures = captures.expect("regex engine failed while matching input");
            let utf8_fragment = &captures[0];

            // Convert the fragment's UTF-8 bytes to one character
            // token per byte.
            // Note: Rust strings are UTF-8 by default.
            let token: String = utf8_fragment
                .bytes()
                .map(|utf8_byte| {
                    *self
                        .utf8_bytes_to_char_tokens
                        .get(&utf8_byte)
                        .expect("unexpected utf8 byte in input")
                })
                .collect();

            // Encode token into byte-pairs; the result is a
            // space-separated list of vocabulary tokens.
            let encoded_tokens = self.byte_pair_encode(&token);
            for encoded_token in encoded_tokens.split(' ') {
                let token_index = self
                    .tokens_to_indexes
                    .get(encoded_token)
                    .unwrap_or_else(|| {
                        panic!(
                            "unexpected bpe-token `{:?}` for token `{:?}` in input",
                            &encoded_token, &token
                        )
                    });
                token_sequence.push(*token_index);
            }
        }

        // Re-attach one end-of-text token per stripped marker.
        token_sequence.extend(std::iter::repeat(END_OF_TEXT_TOKEN).take(eot_count));

        token_sequence
    }

    /// Turns `token_sequence` back into text.
    ///
    /// Byte sequences that are not valid UTF-8 are replaced
    /// lossily rather than reported as an error.
    ///
    /// # Panics
    ///
    /// Panics if a token index is not part of the loaded
    /// vocabulary, or if a vocabulary token contains a character
    /// that has no byte mapping.
    pub fn decode(&self, token_sequence: Vec<i32>) -> String {
        // Phase 1: look up every index and concatenate the tokens.
        let joined_tokens: String = token_sequence
            .iter()
            .map(|token_index| {
                self.indexes_to_tokens
                    .get(token_index)
                    .expect("unexpected token index in output")
                    .as_str()
            })
            .collect();

        // Phase 2: map each character token back to its UTF-8 byte.
        let raw_bytes: Vec<u8> = joined_tokens
            .chars()
            .map(|char_token| {
                *self
                    .char_tokens_to_utf8_bytes
                    .get(&char_token)
                    .expect("unexpected token in output")
            })
            .collect();

        // Phase 3: interpret the UTF-8 bytes as a string.
        String::from_utf8_lossy(&raw_bytes).into_owned()
    }

    /// Applies the byte-pair merges in `bpe_ranks` to `token` and
    /// returns the resulting symbols joined by single spaces.
    ///
    /// `token` is first split into one symbol per character. Then,
    /// repeatedly, the adjacent pair with the lowest rank is looked
    /// up and every occurrence of it is fused into a single symbol.
    /// The loop ends when no adjacent pair has a rank or when only
    /// one symbol is left.
    ///
    /// A `token` with fewer than two characters is returned as-is.
    fn byte_pair_encode(&self, token: &str) -> String {
        // Start from one symbol per character.
        let mut word: Vec<String> = token.chars().map(|c| c.to_string()).collect();

        // Without at least one pair there is nothing to merge.
        let mut pairs = match Self::get_symbol_pairs(&word) {
            Some(pairs) => pairs,
            None => return token.into(),
        };

        loop {
            // Pick the pair with the lowest rank; unranked pairs sort
            // last. If even the best candidate is unranked, no merge
            // rule applies any more and encoding is complete.
            let best_pair = pairs
                .iter()
                .min_by_key(|pair| self.bpe_ranks.get(*pair).copied().unwrap_or(u32::MAX));
            let (first, second) = match best_pair {
                Some(pair) if self.bpe_ranks.contains_key(pair) => pair,
                _ => break,
            };

            // Rebuild the word, fusing each `first second` occurrence.
            let mut merged = Vec::with_capacity(word.len());
            let mut i = 0;
            while i < word.len() {
                // Copy everything up to the next occurrence of `first`
                // unchanged; if there is none, copy the rest and stop.
                match word[i..].iter().position(|symbol| symbol == first) {
                    Some(offset) => {
                        let next = i + offset;
                        merged.extend_from_slice(&word[i..next]);
                        i = next;
                    }
                    None => {
                        merged.extend_from_slice(&word[i..]);
                        break;
                    }
                }

                // `word[i]` is `first` here. Fuse it with its successor
                // when that is `second`, otherwise keep it on its own.
                if i + 1 < word.len() && &word[i + 1] == second {
                    merged.push([first.as_str(), second.as_str()].concat());
                    i += 2;
                } else {
                    merged.push(word[i].clone());
                    i += 1;
                }
            }
            word = merged;

            // Recompute the adjacent pairs for the next round. A word
            // of a single symbol has no pairs, which ends the loop.
            match Self::get_symbol_pairs(&word) {
                Some(next_pairs) => pairs = next_pairs,
                None => break,
            }
        }

        // Convert word into space-separated tokens.
        word.join(" ")
    }

    /// Returns every pair of _adjacent_ symbols in `word`,
    /// in order of appearance.
    ///
    /// A pair that occurs several times in `word` appears
    /// several times in the result. Returns `None` if `word`
    /// has fewer than two symbols and therefore no pairs.
    fn get_symbol_pairs(word: &[String]) -> Option<Vec<(String, String)>> {
        if word.len() < 2 {
            return None;
        }

        let pairs = word
            .windows(2)
            .map(|adjacent| (adjacent[0].clone(), adjacent[1].clone()))
            .collect();

        Some(pairs)
    }
}

/// Structure of the data in an encoder JSON file.
///
/// The file is expected to be a single JSON object whose keys
/// are tokens and whose values are the token indexes.
#[derive(Deserialize)]
struct EncoderJson {
    /// Because of `flatten`, the keys of the top-level JSON
    /// object are collected into this map directly, rather
    /// than being looked up under a `token_indexes` key.
    #[serde(flatten)]
    token_indexes: HashMap<String, i32>,
}

/// Returns a pair of mirrored maps of
/// bytes to unicode characters, and vice-versa.
///
/// Every one of the 256 byte values is assigned exactly one
/// character:
///
/// - bytes in the ranges `!`..=`~`, `¡`..=`¬` and `®`..=`ÿ`
///   map to the character with the same code point;
/// - all other bytes map, in ascending byte order, to
///   consecutive code points starting at U+0100. For example,
///   byte `0` becomes U+0100 and the space byte (32) becomes
///   U+0120.
///
/// The original OpenAI encoder motivates the table like this:
///
/// > To avoid that, we want lookup tables between
/// > utf-8 bytes and unicode strings. And avoids
/// > mapping to whitespace/control characters the
/// > bpe code barfs on.
fn create_utf8_char_maps() -> (HashMap<u8, char>, HashMap<char, u8>) {
    let mut bytes_to_chars = HashMap::with_capacity(256);
    let mut chars_to_bytes = HashMap::with_capacity(256);

    // Next free code point for bytes that cannot represent themselves.
    let mut next_remapped_code: u32 = 256;

    for utf8_byte in 0..=u8::MAX {
        // 0xA1..=0xAC is '¡'..='¬' and 0xAE..=0xFF is '®'..='ÿ'.
        let keeps_own_code = matches!(utf8_byte, b'!'..=b'~' | 0xA1..=0xAC | 0xAE..=0xFF);
        let char_code = if keeps_own_code {
            u32::from(utf8_byte)
        } else {
            let code = next_remapped_code;
            next_remapped_code += 1;
            code
        };

        // All produced codes are below U+0200, far away from the
        // surrogate range, so the conversion cannot fail.
        let utf8_char =
            char::from_u32(char_code).expect("byte-derived code points are valid chars");
        bytes_to_chars.insert(utf8_byte, utf8_char);
        chars_to_bytes.insert(utf8_char, utf8_byte);
    }

    (bytes_to_chars, chars_to_bytes)
}

#[cfg(test)]
mod test {

    use super::*;

    // Locations of the OpenAI data files for the 124M (smallest) GPT-2 model.
    const BPE_PATH: &str = "./gpt-2-model/saved_models/124M_vocab.bpe";
    const ENCODER_PATH: &str = "./gpt-2-model/saved_models/124M_encoder.json";

    // Text fed to the encoder, together with
    // the token sequence it must produce.
    const INPUT_TEXT_STR: &str =
        "GPT-2 is a machine learning model for natural language-processing;";
    const INPUT_TEXT_TOKENS: &[i32] = &[
        38, 11571, 12, 17, 318, 257, 4572, 4673, 2746, 329, 3288, 3303, 12, 36948, 26,
    ];

    // Token sequence fed to the decoder, together
    // with the text it must produce.
    const OUTPUT_TEXT_STR: &str = " it is a simple, high-performance, and scalable machine learning model that is designed to be used in real-world applications.";
    const OUTPUT_TEXT_TOKENS: &[i32] = &[
        340, 318, 257, 2829, 11, 1029, 12, 26585, 11, 290, 43865, 4572, 4673, 2746, 326, 318, 3562,
        284, 307, 973, 287, 1103, 12, 6894, 5479, 13,
    ];

    /// Encoding the sample text yields the expected tokens,
    /// and decoding those tokens restores the sample text.
    #[test]
    fn encode() {
        let tokenizer = Tokenizer::new(BPE_PATH, ENCODER_PATH);

        let encoded = tokenizer.encode(INPUT_TEXT_STR);
        assert_eq!(encoded, INPUT_TEXT_TOKENS.to_vec());

        // Round-trip back to text as a cross-check.
        let round_tripped = tokenizer.decode(encoded);
        assert_eq!(round_tripped, INPUT_TEXT_STR);
    }

    /// Decoding the sample tokens yields the expected text,
    /// and encoding that text restores the sample tokens.
    #[test]
    fn decode() {
        let tokenizer = Tokenizer::new(BPE_PATH, ENCODER_PATH);

        let decoded = tokenizer.decode(OUTPUT_TEXT_TOKENS.to_vec());
        assert_eq!(decoded, OUTPUT_TEXT_STR);

        // Round-trip back to tokens as a cross-check.
        let round_tripped = tokenizer.encode(&decoded);
        assert_eq!(round_tripped, OUTPUT_TEXT_TOKENS.to_vec());
    }
}