// chonkier 0.0.2
//
// 🦛 Chonkie, now in Rust 🦀: No-nonsense, ultra-fast, ultra-light chunking library
// Documentation
// A simple character tokenizer that tokenizes a string into a list of characters
// which are then encoded into a list of integers.
use crate::tokenizer::Tokenizer;
/// Stateless tokenizer that treats every Unicode scalar value (`char`) as one token.
///
/// The type has no fields, so constructing, copying, and comparing it is free.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct CharacterTokenizer {}

// Constructors for the CharacterTokenizer struct
impl CharacterTokenizer {
    /// Creates a new character tokenizer.
    ///
    /// Equivalent to [`CharacterTokenizer::default`].
    pub fn new() -> Self {
        CharacterTokenizer {}
    }
}

// Define the core encode and decode for the CharacterTokenizer struct
impl Tokenizer for CharacterTokenizer {
    /// Encodes `text` as one token per `char`, where each token is the character's
    /// Unicode scalar value (for example `"abc"` becomes `[97, 98, 99]`).
    ///
    /// No vocabulary is involved: every character maps directly to its code point.
    fn encode(&self, text: &str) -> Vec<usize> {
        text.chars().map(|c| c as usize).collect()
    }

    /// Decodes `tokens` back into a string, producing one character per token.
    ///
    /// Tokens that are not valid Unicode scalar values (surrogates, values above
    /// `0x10FFFF`, or anything that does not fit in a `u32`) are rendered as
    /// U+FFFD REPLACEMENT CHARACTER instead of being dropped or causing a panic.
    fn decode(&self, tokens: &[usize]) -> String {
        tokens
            .iter()
            .map(|&token| {
                // `usize` may be wider than `u32`; a bare `as u32` would silently wrap an
                // oversized token onto an unrelated, valid character.
                let scalar = if token <= u32::MAX as usize {
                    char::from_u32(token as u32)
                } else {
                    None
                };
                scalar.unwrap_or(char::REPLACEMENT_CHARACTER)
            })
            .collect()
    }

    /// Returns the number of tokens `text` encodes to, i.e. its `char` count,
    /// without building the token vector.
    fn count_tokens(&self, text: &str) -> usize {
        text.chars().count()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// ASCII text encodes to its code points; empty text encodes to no tokens.
    #[test]
    fn test_tokenizer_encode() {
        let tokenizer = CharacterTokenizer::new();
        assert_eq!(tokenizer.encode("abc"), vec![97, 98, 99]);
        assert_eq!(tokenizer.encode("abcde"), vec![97, 98, 99, 100, 101]);
        assert!(tokenizer.encode("").is_empty());
    }

    /// Valid code points decode back to their characters.
    #[test]
    fn test_tokenizer_decode() {
        let tokenizer = CharacterTokenizer::new();
        assert_eq!(tokenizer.decode(&[97, 98, 99]), "abc");
        assert_eq!(tokenizer.decode(&[]), "");
    }

    /// Tokens are counted per `char`, not per byte.
    #[test]
    fn test_tokenizer_count_tokens() {
        let tokenizer = CharacterTokenizer::new();
        assert_eq!(tokenizer.count_tokens("abc"), 3);
        assert_eq!(tokenizer.count_tokens("héllo"), 5);
        assert_eq!(tokenizer.count_tokens(""), 0);
    }

    /// Encoding then decoding returns the original text, including non-ASCII input.
    #[test]
    fn test_tokenizer_round_trip() {
        let tokenizer = CharacterTokenizer::new();
        let text = "héllo, wörld 🦛";
        let tokens = tokenizer.encode(text);
        assert_eq!(tokens.len(), tokenizer.count_tokens(text));
        assert_eq!(tokenizer.decode(&tokens), text);
    }
}