Skip to main content

Tokenizer

Trait Tokenizer 

Source
pub trait Tokenizer: Send + Sync {
    // Required methods
    fn encode(&self, text: &str, add_special: bool) -> Result<Vec<TokenId>>;
    fn decode(&self, tokens: &[TokenId], skip_special: bool) -> Result<String>;
    fn decode_incremental(
        &self,
        prev: &[TokenId],
        next: TokenId,
    ) -> Result<String>;
    fn vocab_size(&self) -> usize;
    fn special_tokens(&self) -> &SpecialTokens;
    fn token_id(&self, text: &str) -> Option<TokenId>;
    fn token_text(&self, token_id: TokenId) -> Option<&str>;
    fn info(&self) -> TokenizerInfo;

    // Provided methods
    fn is_special_token(&self, token_id: TokenId) -> bool { ... }
    fn apply_chat_template(&self, messages: &[ChatMessage]) -> Result<String> { ... }
}
Expand description

Core tokenizer trait for encoding/decoding operations

Required Methods§

Source

fn encode(&self, text: &str, add_special: bool) -> Result<Vec<TokenId>>

Encode text to token IDs

Source

fn decode(&self, tokens: &[TokenId], skip_special: bool) -> Result<String>

Decode token IDs to text

Source

fn decode_incremental(&self, prev: &[TokenId], next: TokenId) -> Result<String>

Incremental decode: given the previous tokens and a new token, return only the new text. This is crucial for streaming applications, because it avoids re-decoding all tokens.

Source

fn vocab_size(&self) -> usize

Get vocabulary size

Source

fn special_tokens(&self) -> &SpecialTokens

Get special tokens configuration

Source

fn token_id(&self, text: &str) -> Option<TokenId>

Get the token ID for a specific text (if it exists in the vocabulary)

Source

fn token_text(&self, token_id: TokenId) -> Option<&str>

Get text for a specific token ID

Source

fn info(&self) -> TokenizerInfo

Get tokenizer information

Provided Methods§

Source

fn is_special_token(&self, token_id: TokenId) -> bool

Check if token is a special token

Source

fn apply_chat_template(&self, messages: &[ChatMessage]) -> Result<String>

Apply chat template if supported

Implementors§