gpt-model 0.1.0

Pure-Rust inference wrapper for GPT-2 large language models.
//! Runtime wrappers for running inference
//! on a GPT model saved in ONNX format.
use anyhow::Result;
use ndarray::{Array, ArrayD, ArrayViewMut, Axis, Ix1, Ix2, Ix3, Ix6};
use rand::{distributions::WeightedIndex, prelude::Distribution};
use tract_onnx::prelude::{
    tvec, DatumExt, Framework, Graph, InferenceModelExt, SimplePlan, Tensor, TypedFact, TypedOp,
};

/// Alias for the type returned by Tract
/// for an optimized and strongly-typed
/// runnable ML model.
type OptimizedOnnxModel =
    SimplePlan<TypedFact, Box<dyn TypedOp>, Graph<TypedFact, Box<dyn TypedOp>>>;

/// Alias for the shape of the GPT-2 `tokens` input tensor.
type TokensInput = Array<i32, Ix2>;

/// Alias for the shape of the GPT-2 `token_predictions` output tensor.
///
/// The shape axes correspond to:
/// - `0`: Input batch size
/// - `1`: Input token sequence length
/// - `2`: Model vocabulary size
type InferenceOutput = Array<f32, Ix3>;

/// Alias for the shape of the GPT-2 `token_embeddings` output tensor.
///
/// The shape axes correspond to:
/// - `0`: Input batch size
/// - `1`: Model layer count
/// - `2`: Key / value pairs (always `2` "rows")
/// - `3`: Model head count
/// - `4`: Input token sequence length
/// - `5`: Model embeddings per layer / model head count
type HiddenLayersOutput = Array<f32, Ix6>;

/// Token vocabulary size of the GPT-2 models supported
/// by this library.
const GPT2_VOCABULARY_SIZE: usize = 50257;

/// Number of layers used by the GPT-2 models supported
/// by this library.
const GPT2_LAYER_COUNT: usize = 12;

/// Number of heads used by the GPT-2 models supported
/// by this library.
const GPT2_HEAD_COUNT: usize = 12;

/// Number of embeddings used by each layer of the
/// GPT-2 models supported by this library.
pub const GPT2_EMBEDDING_SIZE: usize = 768;

/// Sampling temperature, which affects
/// the entropy of inferences.
///
/// Temperatures of:
/// - `0.0` will result in no entropy (deterministic outputs).
/// - `1.0` will defer to the model's internal entropy.
/// - `> 1` will exaggerate the model's entropy.
///
/// In general, _higher_ temperatures result in more
/// "creative" samples of the model's inferences.
const SAMPLE_TEMPERATURE: f32 = 0.9;

/// Sampling filter which restricts samples
/// of the model's inference for a token to
/// the `P` most confident inferences.
///
/// P-values of:
/// - `0.0` will select only the most likely inference.
/// - `1.0` will select all inferences (i.e., the entire
///   vocabulary of the model).
///
/// In general, _higher_ P-values result in
/// more "creative" samples of the model's inferences.
const SAMPLE_MIN_P_VALUE: f32 = 0.5;

/// The GPT-2 natural language ML model.
///
/// ## Example Usage
///
/// ```rust
/// # use gpt::tokenizer::Tokenizer;
/// # use gpt::model::Gpt2Model;
/// #
/// # let bpe_path = "./gpt-2-model/saved_models/124M_vocab.bpe";
/// # let encoder_path = "./gpt-2-model/saved_models/124M_encoder.json";
/// # let model_path = "./gpt-2-model/saved_models/gpt-2-124M.onnx";
/// #
/// # let batch_size = 1;
/// # let sequence_length = 128;
/// #
/// // Load tokenizer and GPT-2 model.
/// let tokenizer = Tokenizer::new(bpe_path, encoder_path);
/// let gpt_model = Gpt2Model::new(model_path, batch_size, sequence_length).unwrap();
///
/// // Convert input text to a token sequence.
/// let text_in = "Horses aren't real; they can't hurt you.";
/// let (tokens_in, padding) = tokenizer.encode_to_length(text_in, sequence_length);
///
/// // Convert token sequence to an input tensor, and get
/// // an inference from the model.
/// let tensor_in = gpt_model.tensor_from_tokens(&[tokens_in]);
/// let (inference, hidden_layers) = gpt_model.infer(tensor_in);
///
/// // Generate the next tokens based on the inference,
/// // and convert the tokens to text.
/// let tokens_out = gpt_model.tokens_from_inference(inference, &[padding]);
/// let generated_text = tokenizer.decode(tokens_out);
///
/// // Bonus: Extract the embedding of the input text from
/// //        the hidden layers.
/// let text_embedding = gpt_model.embeddings_from_layers(&hidden_layers, &[padding], 11);
/// ```
pub struct Gpt2Model {
    /// The loaded ONNX model.
    model: OptimizedOnnxModel,

    /// The index of the model's token inference output.
    out_inference_index: usize,

    /// The index of the model's hidden layers output.
    out_hidden_layers_index: usize,

    /// The number of token sequences
    /// (i.e., "sentences") given to
    /// the model during inference.
    batch_size: usize,

    /// The length of each token sequence
    /// (i.e., "sentence") given to the
    /// model during inference.
    sequence_length: usize,
}

impl Gpt2Model {
    /// Creates a new GPT-2 model from the ONNX
    /// model saved at `onnx_model_path`, with fixed
    /// `batch_size` and `sequence_length`.
    ///
    /// `batch_size` specifies the maximum number of
    /// texts ("token sequences") that can be processed
    /// during each inference request.
    ///
    /// `sequence_length` specifies the number of tokens
    /// that can be processed by the model in a single
    /// token sequence. Sequences will be truncated and/or
    /// padded to match this length.
    pub fn new(onnx_model_path: &str, batch_size: usize, sequence_length: usize) -> Result<Self> {
        // Load the model into memory.
        let mut model = tract_onnx::onnx()
            .with_ignore_output_shapes(true)
            .with_ignore_output_types(true)
            .model_for_path(onnx_model_path)?;

        // Configure shape of the input tokens.
        model.set_input_fact(0, i32::fact([batch_size, sequence_length]).into())?;

        // Configure shape of the output inferences.
        let out_inference = model
            .find_outlet_label("next_token_inferences")
            .expect("missing inference output");
        model.set_outlet_fact(
            out_inference,
            f32::fact([batch_size, sequence_length, GPT2_VOCABULARY_SIZE]).into(),
        )?;
        let out_inference_index = model
            .output_outlets()?
            .iter()
            .position(|o| o == &out_inference)
            .expect("missing inference output");

        // Configure shape of the output hidden layers.
        let out_hidden_layers = model
            .find_outlet_label("hidden_layers")
            .expect("missing hidden layers output");
        model.set_outlet_fact(
            out_hidden_layers,
            f32::fact([
                batch_size,
                GPT2_LAYER_COUNT,
                2,
                GPT2_HEAD_COUNT,
                sequence_length,
                GPT2_EMBEDDING_SIZE / GPT2_HEAD_COUNT,
            ])
            .into(),
        )?;
        let out_hidden_layers_index = model
            .output_outlets()?
            .iter()
            .position(|o| o == &out_hidden_layers)
            .expect("missing hidden layers output");

        // Prepare model for execution.
        let model = model.into_optimized()?;
        let model = model.into_runnable()?;

        Ok(Gpt2Model {
            model,
            out_inference_index,
            out_hidden_layers_index,
            batch_size,
            sequence_length,
        })
    }

    /// Converts a slice of one or more token sequences
    /// into a single tensor which may be passed into
    /// the GPT-2 model.
    ///
    /// ## Panics
    ///
    /// If `tokens` contains any token sequences not
    /// matching this model's `sequence_length`, or if
    /// the number of token sequences in `tokens` does
    /// not match this model's `batch_size`.
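    ///
    /// A shape sketch (not compiled as a doctest; names are assumed
    /// from the crate-level example above), using `batch_size = 1`
    /// and `sequence_length = 128`:
    ///
    /// ```rust,ignore
    /// let tensor_in = gpt_model.tensor_from_tokens(&[tokens_in]);
    /// assert_eq!(tensor_in.dim(), (1, 128)); // (batch_size, sequence_length)
    /// ```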
    pub fn tensor_from_tokens(&self, tokens: &[Vec<i32>]) -> TokensInput {
        assert_eq!(self.batch_size, tokens.len());

        TokensInput::from_shape_fn(
            (self.batch_size, self.sequence_length),
            |(batch_index, sequence_index)| tokens[batch_index][sequence_index],
        )
    }

    /// Runs the model to generate an inference for `tensor`.
    ///
    /// The returned tuple will contain `(inference, hidden_layers)`,
    /// where `inference` is a 3D tensor of shape
    /// `[batch_size, sequence_length, vocabulary size]`,
    /// and `hidden_layers` is a 6D tensor of shape
    /// `[batch_size, layers, 2, head count, sequence_length, embeddings per head]`.
    ///
    /// For most GPT-2 models, the vocabulary size is `50257`.
    ///
    /// For the 124M ("small") GPT-2 model, there will be
    /// `12` layers, `12` heads, and `64` embeddings per head,
    /// for a total of `768` embeddings per layer.
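    ///
    /// A shape sketch (not compiled as a doctest; `gpt_model` and
    /// `tensor_in` are assumed from the crate-level example above),
    /// using the 124M model with `batch_size = 1` and
    /// `sequence_length = 128`:
    ///
    /// ```rust,ignore
    /// let (inference, hidden_layers) = gpt_model.infer(tensor_in);
    /// assert_eq!(inference.dim(), (1, 128, 50257));
    /// assert_eq!(hidden_layers.dim(), (1, 12, 2, 12, 128, 64));
    /// ```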
    pub fn infer(&self, tensor: TokensInput) -> (InferenceOutput, HiddenLayersOutput) {
        // Convert input into a concrete Tract tensor.
        let tensor: Tensor = tensor.into();

        // Run inference.
        let model_outputs = self.model.run(tvec!(tensor)).expect("inference");

        // Extract inference data.
        let inference = model_outputs[self.out_inference_index].clone();
        let hidden_layers = model_outputs[self.out_hidden_layers_index].clone();

        // Convert inference data to f32 arrays.
        let inference = (*inference).clone();
        let inference: ArrayD<f32> = inference.into_array().unwrap();
        let inference: InferenceOutput = inference.into_dimensionality().unwrap();
        let hidden_layers = (*hidden_layers).clone();
        let hidden_layers: ArrayD<f32> = hidden_layers.into_array().unwrap();
        let hidden_layers: HiddenLayersOutput = hidden_layers.into_dimensionality().unwrap();

        (inference, hidden_layers)
    }

    /// Returns the number of hidden layers within `hidden_layers`.
    pub fn count_layers(&self, hidden_layers: &HiddenLayersOutput) -> usize {
        hidden_layers.dim().1
    }

    /// Samples `inference` for the next
    /// token for each sequence in the batch.
    ///
    /// `tokens_padding` must be a slice of the
    /// same length as `batch_size`, where each
    /// element corresponds to the number of padding
    /// tokens added onto the input token sequence
    /// for that batch element.
    ///
    /// Returns a vector of length `batch_size`,
    /// where each entry is the sampled next token for the
    /// corresponding sequence.
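    ///
    /// A usage sketch (not compiled as a doctest; names are assumed
    /// from the crate-level example above):
    ///
    /// ```rust,ignore
    /// let next_tokens = gpt_model.tokens_from_inference(inference, &[padding]);
    /// assert_eq!(next_tokens.len(), 1); // one sampled token per batch entry
    /// ```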
    pub fn tokens_from_inference(
        &self,
        mut inference: InferenceOutput,
        tokens_padding: &[usize],
    ) -> Vec<i32> {
        // Extract and check inference dimensions.
        let batch_size = inference.dim().0;
        let sequence_length = inference.dim().1;
        assert_eq!(self.batch_size, batch_size);
        assert_eq!(self.sequence_length, sequence_length);
        assert_eq!(batch_size, tokens_padding.len());

        // Iterate over all token sequences in
        // the batch.
        let mut token_indexes = Vec::with_capacity(batch_size);
        let axis = Axis(0);
        for (index, padding) in tokens_padding.iter().enumerate().take(batch_size) {
            let mut inference = inference.index_axis_mut(axis, index);
            let sample = sample_nucleus(
                &mut inference,
                Self::last_token_inference_index(sequence_length, *padding),
            );
            token_indexes.push(sample as i32);
        }

        token_indexes
    }

    /// Post-processes `hidden_layers` to extract
    /// the embedding of each sequence in the batch.
    ///
    /// Returns a 2D tensor of shape `[batch_size, embeddings per layer]`,
    /// where each batch entry is the embedding of the
    /// entire _input_ sequence for that entry.
    ///
    /// For the 124M ("small") GPT-2 model, there
    /// are `768` embeddings per layer.
    ///
    /// `tokens_padding` must be a slice of the
    /// same length as `batch_size`, where each
    /// element corresponds to the number of padding
    /// tokens added onto the input token sequence
    /// for that batch element.
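    ///
    /// A shape sketch (not compiled as a doctest; names are assumed
    /// from the crate-level example above), using the 124M model
    /// with `batch_size = 1`:
    ///
    /// ```rust,ignore
    /// let embeddings = gpt_model.embeddings_from_layers(&hidden_layers, &[padding], 11);
    /// // One 768-element embedding per batch entry.
    /// assert_eq!(embeddings.dim(), (1, 768));
    /// ```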
    pub fn embeddings_from_layers(
        &self,
        hidden_layers: &HiddenLayersOutput,
        tokens_padding: &[usize],
        hidden_layer_index: usize,
    ) -> Array<f32, Ix2> {
        // Extract dimensional data from the layers.
        let batch_size = hidden_layers.dim().0;
        assert_eq!(2, hidden_layers.dim().2);
        let head_count = hidden_layers.dim().3;
        let token_sequence_length = hidden_layers.dim().4;
        let embeddings_per_head = hidden_layers.dim().5;
        let embeddings_per_layer = embeddings_per_head * head_count;

        // Iterate over all final hidden layers in the batch.
        let mut embeddings = Array::zeros((0, embeddings_per_layer));
        for (index, padding) in tokens_padding.iter().enumerate().take(batch_size) {
            // Restrict view to the hidden layers for this batch.
            let hidden_layer = hidden_layers.index_axis(Axis(0), index);

            // TODO: This line restricts the view to the _last_
            // hidden layer of this batch. However, "lower" (earlier)
            // layers may perform better in tasks where over-contextualization
            // of embeddings isn't desirable:
            // https://kawine.github.io/blog/nlp/2020/02/03/contextual.html
            let hidden_layer = hidden_layer.index_axis(Axis(0), hidden_layer_index);

            // Restrict view to the "value" axis of the hidden layer.
            let hidden_layer = hidden_layer.index_axis(Axis(0), 1);

            // Concatenate embeddings across all GPT model "heads."
            let mut embedding = Vec::with_capacity(embeddings_per_layer);
            for head in 0..head_count {
                // Restrict view to the current head.
                let hidden_layer = hidden_layer.index_axis(Axis(0), head);

                // Restrict view to the last non-padding token.
                let token_index = Self::last_token_inference_index(token_sequence_length, *padding);
                let hidden_layer = hidden_layer.index_axis(Axis(0), token_index);

                embedding.extend(hidden_layer.iter());
            }
            let embedding: Array<f32, Ix1> = Array::from_vec(embedding);

            // Copy embeddings into output.
            embeddings.push_row(embedding.view()).expect("row");
        }

        embeddings
    }

    /// Returns the last index which should
    /// contain an inference on non-padding
    /// token data.
    ///
    /// In the case where `token_padding == token_sequence_length`,
    /// `0` will be returned.
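    ///
    /// ## Example
    ///
    /// A small worked example of the padding arithmetic (crate path
    /// as in the crate-level example above):
    ///
    /// ```rust
    /// # use gpt::model::Gpt2Model;
    /// // With a 128-token sequence and 20 padding tokens, the last
    /// // non-padding token sits at index 128 - 20 - 1 = 107.
    /// assert_eq!(Gpt2Model::last_token_inference_index(128, 20), 107);
    /// // A fully-padded sequence clamps to index 0.
    /// assert_eq!(Gpt2Model::last_token_inference_index(128, 128), 0);
    /// ```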
    pub fn last_token_inference_index(token_sequence_length: usize, token_padding: usize) -> usize {
        if token_padding >= token_sequence_length {
            0
        } else {
            token_sequence_length - token_padding - 1
        }
    }
}

/// Performs nucleus sampling of an `inference`
/// of shape `[sequence_length, vocabulary]`
/// for the token at `token_index` in the sequence.
fn sample_nucleus(inference: &mut ArrayViewMut<f32, Ix2>, token_index: usize) -> usize {
    // Restrict our view to the inference of the `token_index`th token.
    let mut inference = inference.index_axis_mut(Axis(0), token_index);

    // Apply sampling temperature.
    inference.mapv_inplace(|score| score / SAMPLE_TEMPERATURE);

    // Each value in `inference` is a "score" of how likely
    // it is that a specific token comes _after_ the token
    // the inference ran on.
    //
    // Here, we create a clone of the inference and sort it
    // from the highest to lowest scores.
    let mut sorted_scores: Vec<f32> = inference.iter().copied().collect();
    sorted_scores.sort_by(|a, b| a.total_cmp(b).reverse());
    let mut sorted_scores: Array<f32, Ix1> = sorted_scores.into();
    assert!(sorted_scores[0] > sorted_scores[sorted_scores.len() - 1]);

    // A clone of the original scores will be needed later,
    // when performing the final sampling of the scores.
    let original_sorted_scores = sorted_scores.clone();

    // Softmax the sorted scores.
    softmax(&mut sorted_scores.view_mut());

    // Cumulative sum the sorted scores.
    sorted_scores.accumulate_axis_inplace(Axis(0), |&prev, curr| *curr += prev);

    // Find the lowest score in `k`, which
    // is the set of scores that have a
    // cumulative probability greater
    // than the sampling P-value.
    //
    // Because the scores are sorted
    // in descending order, we can use
    // the count of all scores `<=` the
    // sampling P-value, minus one,
    // as the index of the lowest
    // score in `k`.
    //
    // In "Top-K" sampling, we would
    // stop processing at this stage
    // and randomly sample from the set
    // of scores in `k`.
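    //
    // Worked example (illustrative numbers only): if the softmax'ed
    // cumulative sums are `[0.20, 0.45, 0.65, 1.00]` and the P-value
    // is `0.5`, two entries are `<= 0.5`, so `k_min_index` is `1`,
    // `k_min_score` is the second-highest original score, and only
    // the top two tokens survive the masking below.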
    let iter = sorted_scores
        .iter()
        .filter(|score| score <= &&SAMPLE_MIN_P_VALUE);
    let k_min_index = iter.count().saturating_sub(1);
    let k_min_score = original_sorted_scores[k_min_index];

    // "Mask" or "drop out" all scores lower
    // than `k_min_score` by replacing them
    // with a tiny number.
    //
    // This masking will cause these scores
    // to be effectively removed from consideration
    // during sampling when we softmax the scores.
    inference.mapv_inplace(|score| {
        if score < k_min_score {
            return -1e10;
        }

        score
    });

    // Calculate the softmax of the scores.
    softmax(&mut inference.view_mut());

    // Draw a weighted sample from the inference.
    // Although not _technically_ a multinomial sample,
    // the resulting inferences are good enough!
    let inference = inference.mapv(|score| score as f64);
    let multinomial = WeightedIndex::new(inference.view()).unwrap();

    multinomial.sample(&mut rand::thread_rng())
}

/// Calculates the `softmax` of a 1-dimensional
/// `tensor` in-place, replacing its contents
/// with their softmax'ed equivalents.
///
/// ## What's a `softmax`?
///
/// The `softmax` function converts a vector
/// (1-dimensional tensor, or "array") of `n`
/// values into a vector of `n` values _that
/// sum to `1.0`_.
///
/// Regardless of what values are in the original
/// inputs, the output will always contain values
/// in the range of `0.0` to `1.0`. This property
/// makes `softmax` similar to a normalization
/// function that can turn arbitrary data into
/// a `0-1` scale.
///
/// _Unlike_ a "typical" normalization function,
/// which maps values to a `0-1` scale based on
/// some known lower and upper bound (e.g., mapping
/// any byte in the range `0-255` to `0-1`),
/// `softmax` maps values based on their relative
/// "weights".
///
/// For example, a vector containing
/// `(-0.3, 1,000,000)` might produce a `softmax`
/// vector of `(0.1, 0.9)` (fyi, these numbers
/// are for illustration and not technically correct).
/// This mapping shows that the first element was
/// _very_ small compared to the second element
/// in the input vector.
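///
/// A minimal numeric sketch (not compiled as a doctest, since
/// `softmax` is private to this module):
///
/// ```rust,ignore
/// let mut scores = ndarray::arr1(&[1.0_f32, 2.0, 3.0]);
/// softmax(&mut scores.view_mut());
/// // `scores` is now approximately [0.090, 0.245, 0.665],
/// // and the values sum to 1.0.
/// ```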
fn softmax(tensor: &mut ArrayViewMut<f32, Ix1>) {
    // Shift all values to handle under/over flow.
    let max_value = *tensor.iter().max_by(|a, b| a.total_cmp(b)).unwrap();
    tensor.mapv_inplace(|value| value - max_value);

    // Perform the softmax operation, which:
    //
    // 1. Replaces each value `v` with the value of
    //    Euler's constant raised to that value. We'll
    //    call each of these new values `e^v`.
    //
    // 2. Sums all `e^v`. We'll call this sum `sum(e^v)`.
    //
    // 3. Replace each `e^v` with `e^v / sum(e^v)`.
    //
    // The final values will be equivalent to their
    // normalized probabilities on a 0-1 scale that sums to 1.
    tensor.mapv_inplace(|value| value.exp());
    let sum_exps = tensor.sum();
    tensor.mapv_inplace(|value| value / sum_exps);

    // Handle rounding errors to ensure all values sum to 1.
    let sum_values = tensor.sum();
    tensor.mapv_inplace(|value| value / sum_values);
}

#[cfg(test)]
pub mod test {
    use crate::tokenizer::{self, Tokenizer};

    use super::*;

    // Paths to the saved 124M (smallest) GPT-2 model and its tokenizer data.
    const MODEL_PATH: &str = "./gpt-2-model/saved_models/gpt-2-124M.onnx";
    const BPE_PATH: &str = "./gpt-2-model/saved_models/124M_vocab.bpe";
    const ENCODER_PATH: &str = "./gpt-2-model/saved_models/124M_encoder.json";

    // Expected model hyperparameters.
    const BATCH_SIZE: usize = 1;
    const SEQUENCE_LENGTH: usize = 128;

    // Sample input text for inference.
    const INPUT_TEXT_STR: &str =
        "GPT-2 is a machine learning model for natural language-processing;";

    #[test]
    fn infers_and_samples_sentence() {
        // Load model.
        let model = Gpt2Model::new(MODEL_PATH, BATCH_SIZE, SEQUENCE_LENGTH).expect("load failed");

        // Load tokenizer.
        let tokenizer = Tokenizer::new(BPE_PATH, ENCODER_PATH);

        // Prepare initial set of tokens.
        let tokens = tokenizer.encode(INPUT_TEXT_STR);
        let mut all_tokens = tokens.clone();

        eprintln!("   Prompt: `{}`", INPUT_TEXT_STR);
        eprint!("Inference: ");

        // Predict the next full sentence from the model.
        let mut full_sentence = String::from(INPUT_TEXT_STR);
        for _ in 0..64 {
            // Prepare input tokens, padding as necessary.
            let mut inference_tokens = all_tokens.clone();
            let padding = SEQUENCE_LENGTH - inference_tokens.len();
            for _ in 0..padding {
                inference_tokens.push(tokenizer::PAD_TOKEN);
            }

            // Prepare inference tensor.
            let tensor = model.tensor_from_tokens(&[inference_tokens]);

            // Run inference.
            let (inference, hidden_layers) = model.infer(tensor);

            // Sample the next token in the sentence based on inference.
            let next_token = model.tokens_from_inference(inference, &[padding])[0];
            all_tokens.push(next_token);

            // Decode the token and add it to the sentence.
            let next_word = tokenizer.decode(vec![next_token]);
            full_sentence.push_str(&next_word);

            eprint!("{}", next_word);

            // Quit early if the model emits a full-stop.
            // In these tests, we always embed from the final
            // ("highest") hidden layer.
            let hidden_layer_index = model.count_layers(&hidden_layers) - 1;
            if full_sentence.ends_with('.') {
                eprintln!();
                eprintln!(
                    "Final inference embedding: {:?}",
                    model.embeddings_from_layers(&hidden_layers, &[padding], hidden_layer_index)
                );
                break;
            }

            assert_eq!(tokenizer.decode(all_tokens.clone()), full_sentence);
        }
    }
}