Trait Tokenizer

Source

pub trait Tokenizer: Send + Sync {
    // Required methods
    fn encode(&self, text: &str, add_special: bool) -> Result<Vec<TokenId>>;
    fn decode(&self, tokens: &[TokenId], skip_special: bool) -> Result<String>;
    fn decode_incremental(
        &self,
        prev: &[TokenId],
        next: TokenId,
    ) -> Result<String>;
    fn vocab_size(&self) -> usize;
    fn special_tokens(&self) -> &SpecialTokens;
    fn token_id(&self, text: &str) -> Option<TokenId>;
    fn token_text(&self, token_id: TokenId) -> Option<&str>;
    fn info(&self) -> TokenizerInfo;

    // Provided methods
    fn is_special_token(&self, token_id: TokenId) -> bool { ... }
    fn apply_chat_template(&self, messages: &[ChatMessage]) -> Result<String> { ... }
}

Expand description

Core tokenizer trait for encoding/decoding operations

Required Methods§

Source

fn encode(&self, text: &str, add_special: bool) -> Result<Vec<TokenId>>

Encode text to token IDs

Source

fn decode(&self, tokens: &[TokenId], skip_special: bool) -> Result<String>

Decode token IDs to text

Source

fn decode_incremental(&self, prev: &[TokenId], next: TokenId) -> Result<String>

Incremental decode: given the previous tokens and a new token, return only the new text. This is crucial for streaming applications to avoid re-decoding all tokens.

Source