minbpe 0.1.0

Port of Andrej Karpathy's minbpe to Rust
Documentation
use std::fs;
use std::path::PathBuf;

use minbpe::GPT4Tokenizer;
use minbpe::RegexTokenizerTrait;

/// Times the main stages of running `GPT4Tokenizer` over a sample text file.
///
/// The stages are executed in order and each one prints its wall-clock
/// duration to stdout:
///
/// 1. `GPT4Tokenizer::initialize()`
/// 2. `GPT4Tokenizer::default()`
/// 3. reading `tests/taylorswift.txt` into a `String`
/// 4. `tokenizer.encode(&text)`
///
/// # Errors
///
/// Returns an `std::io::Error` if the input file cannot be read. The error
/// keeps the original `ErrorKind` and its message names the offending path.
fn main() -> std::io::Result<()> {
    // Relative path: resolved against the process's current working directory.
    let file_path = PathBuf::from("tests/taylorswift.txt");

    // Pre-initialize the tokenizer so that this cost is measured separately
    // from instance construction below.
    println!("Pre-initializing the tokenizer...");
    let start = std::time::Instant::now();
    GPT4Tokenizer::initialize();
    let duration = start.elapsed();
    println!(
        "GPT4Tokenizer static initialization completed in: {:?}",
        duration
    );

    // Get default instance of the tokenizer
    println!("Getting a default instance of GPT4Tokenizer...");
    let start = std::time::Instant::now();
    let tokenizer = GPT4Tokenizer::default();
    let duration = start.elapsed();
    println!(
        "GPT4Tokenizer default instance construction completed in: {:?}",
        duration
    );

    // Read the input file
    println!("Reading file: {:?}...", file_path);
    let start = std::time::Instant::now();
    // Attach the path to the error: a bare "No such file or directory" gives
    // no hint that the program was started from the wrong directory.
    let text = fs::read_to_string(&file_path).map_err(|err| {
        std::io::Error::new(
            err.kind(),
            format!("failed to read {}: {}", file_path.display(), err),
        )
    })?;
    let duration = start.elapsed();
    // `String::len` is the UTF-8 byte length; count scalar values so the
    // number matches the "characters" wording. This happens after the timer
    // has been stopped, so it does not skew the measured read time.
    let char_count = text.chars().count();
    println!(
        "Reading {} characters completed in: {:?}",
        char_count, duration
    );

    // Timing the encoding process, optional.
    let start = std::time::Instant::now();
    let tokens = tokenizer.encode(&text);
    let duration = start.elapsed();

    println!("Encoding completed in: {:?}", duration);
    println!("Produced {} encoded tokens", tokens.len());

    Ok(())
}