bpe-openai 0.3.0

Prebuilt fast byte-pair encoders for OpenAI.
Documentation
use std::env;
use std::fs::File;
use std::io::Read;
use std::path::PathBuf;

use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
use serde::Serialize;

/// Build-script entry point.
///
/// Runs `serialize_tiktoken_bpe` once for every gzip-compressed tiktoken
/// dictionary embedded from the `data/` directory, then tells Cargo to re-run
/// this script only when `build.rs` itself changes.
fn main() {
    // Every dictionary is currently serialized with the same hash factor.
    const HASH_FACTOR: u64 = 17846336922010275747;

    // (encoder name, embedded gzip-compressed dictionary), in build order.
    let dictionaries: [(&str, &[u8]); 3] = [
        ("cl100k_base", &include_bytes!("data/cl100k_base.tiktoken.gz")[..]),
        ("o200k_base", &include_bytes!("data/o200k_base.tiktoken.gz")[..]),
        ("voyage3_base", &include_bytes!("data/voyage3_base.tiktoken.gz")[..]),
    ];

    for (name, data) in dictionaries {
        serialize_tiktoken_bpe(name, data, HASH_FACTOR);
    }

    println!("cargo::rerun-if-changed=build.rs");
}

/// Builds a [`BytePairEncoding`] from a gzip-compressed tiktoken dictionary
/// and writes it, serialized with `rmp_serde`, to `$OUT_DIR/bpe_{name}.dict`.
///
/// * `name` - encoder name; only used to form the output file name.
/// * `data` - gzip-compressed tiktoken dictionary bytes.
/// * `hash_factor` - forwarded as `Some(hash_factor)` to
///   [`BytePairEncoding::from_dictionary`].
///
/// # Panics
///
/// Panics (and thereby fails the build) if the data cannot be decompressed
/// into UTF-8 text, the dictionary cannot be parsed, `OUT_DIR` is not set, or
/// the output file cannot be created, written, or flushed.
fn serialize_tiktoken_bpe(name: &str, data: &[u8], hash_factor: u64) {
    let mut dec = flate2::read::GzDecoder::new(data);
    let mut tiktoken = String::new();
    dec.read_to_string(&mut tiktoken).expect("can decode data");
    let tokens = read_tiktoken(&tiktoken).expect("can read data");

    // Build the encoder before touching the file system so that a failure here
    // does not leave an empty output file behind.
    let bpe = BytePairEncoding::from_dictionary(tokens, Some(hash_factor));

    let mut path = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set during build"));
    path.push(format!("bpe_{name}.dict"));
    let file = File::create(path).expect("can create output file");

    // The serializer issues many small writes; buffer them instead of hitting
    // the file with each one.
    let mut writer = std::io::BufWriter::new(file);
    bpe.serialize(&mut rmp_serde::Serializer::new(&mut writer))
        .expect("serialization succeeds");

    // Dropping a `BufWriter` ignores flush errors, so flush explicitly to make
    // a failed write fail the build.
    std::io::Write::flush(&mut writer).expect("can flush output file");
}