Skip to main content

Tokenizer

Trait Tokenizer 

Source
pub trait Tokenizer: Send + Sync {
    // Required methods
    fn encode(
        &self,
        text: &str,
        add_special: bool,
    ) -> Result<Vec<TokenId>, FerrumError>;
    fn decode(
        &self,
        tokens: &[TokenId],
        skip_special: bool,
    ) -> Result<String, FerrumError>;
    fn decode_incremental(
        &self,
        prev: &[TokenId],
        next: TokenId,
    ) -> Result<String, FerrumError>;
    fn vocab_size(&self) -> usize;
    fn special_tokens(&self) -> &SpecialTokens;
    fn token_id(&self, text: &str) -> Option<TokenId>;
    fn token_text(&self, token_id: TokenId) -> Option<&str>;
    fn info(&self) -> TokenizerInfo;

    // Provided methods
    fn is_special_token(&self, token_id: TokenId) -> bool { ... }
    fn apply_chat_template(
        &self,
        messages: &[ChatMessage],
    ) -> Result<String, FerrumError> { ... }
}
Expand description

Core tokenizer trait for encoding/decoding operations

Required Methods§

Source

fn encode( &self, text: &str, add_special: bool, ) -> Result<Vec<TokenId>, FerrumError>

Encode text to token IDs

Source

fn decode( &self, tokens: &[TokenId], skip_special: bool, ) -> Result<String, FerrumError>

Decode token IDs to text

Source

fn decode_incremental( &self, prev: &[TokenId], next: TokenId, ) -> Result<String, FerrumError>

Incremental decode: given the previous tokens and a new token, return only the new text. This is crucial for streaming applications to avoid re-decoding all tokens.

Source

fn vocab_size(&self) -> usize

Get vocabulary size

Source

fn special_tokens(&self) -> &SpecialTokens

Get special tokens configuration

Source

fn token_id(&self, text: &str) -> Option<TokenId>

Get token ID for a specific text (if exists in vocabulary)

Source

fn token_text(&self, token_id: TokenId) -> Option<&str>

Get text for a specific token ID

Source

fn info(&self) -> TokenizerInfo

Get tokenizer information

Provided Methods§

Source

fn is_special_token(&self, token_id: TokenId) -> bool

Check if token is a special token

Source

fn apply_chat_template( &self, messages: &[ChatMessage], ) -> Result<String, FerrumError>

Apply chat template if supported

Implementors§