minbpe 0.1.0

Port of Andrej Karpathy's minbpe to Rust
Documentation
#[cfg(all(test, feature = "tiktoken_tests"))]
mod tests {
    use std::collections::HashSet;

    use lazy_static::lazy_static;
    use minbpe::{GPT4Tokenizer, RegexTokenizerTrait, Token};
    use proptest::prelude::*;
    use tiktoken_rs::{cl100k_base, CoreBPE};

    lazy_static! {
        static ref SPECIAL_TOKENS: HashSet<&'static str> = HashSet::new();
        static ref TIKTOKEN_ENC: CoreBPE = cl100k_base().unwrap();
        static ref GPT4_TOKENIZER: GPT4Tokenizer = GPT4Tokenizer::default();
    }

    fn test_one(s: &str) {
        let special_tokens = HashSet::new();

        let tiktoken_ids = TIKTOKEN_ENC.encode(s, special_tokens);
        let tiktoken_tokens: Vec<Token> = tiktoken_ids.iter().map(|&id| id as Token).collect();

        let gpt4_tokenizer_tokens = GPT4_TOKENIZER.encode(s);

        assert_eq!(tiktoken_tokens, gpt4_tokenizer_tokens);
    }

    #[test]
    fn test_high_char() {
        test_one("\u{1e01b}%SΣ");
    }

    proptest! {
        #[test]
        #[allow(unused_must_use)]
        fn gpt4_tokenizer_matches_tiktoken(s in "\\PC*") {
          test_one(&s);
        }
    }
}