Skip to main content

TokenizerCapabilities

Trait TokenizerCapabilities 

Source
/// Advanced tokenizer capabilities, layered on top of the base `Tokenizer` trait.
///
/// NOTE(review): `TokenId`, `TokenType`, and the `Result` alias are declared
/// elsewhere in this crate — presumably `Result` is a crate-local alias with a
/// fixed error type; confirm against the crate root.
pub trait TokenizerCapabilities: Tokenizer {
    // Required methods
    /// Get token probability/likelihood for the given text; `None` when the
    /// tokenizer cannot provide a probability for this token.
    fn token_probability(&self, text: &str, token_id: TokenId) -> Option<f32>;
    /// Get all possible tokens for a prefix.
    fn get_prefix_tokens(&self, prefix: &str) -> Result<Vec<TokenId>>;
    /// Check whether the token sequence can be extended with `next_token`.
    fn can_extend(&self, tokens: &[TokenId], next_token: TokenId) -> bool;
    /// Get the token's type (word, subword, punctuation, etc.).
    fn token_type(&self, token_id: TokenId) -> TokenType;
    /// Normalize text before tokenization.
    fn normalize_text(&self, text: &str) -> String;
    /// Pre-tokenize text (split into words/subwords) ahead of full tokenization.
    fn pre_tokenize(&self, text: &str) -> Vec<String>;
}
Expand description

Advanced tokenizer capabilities

Required Methods§

Source

fn token_probability(&self, text: &str, token_id: TokenId) -> Option<f32>

Get token probability/likelihood for text

Source

fn get_prefix_tokens(&self, prefix: &str) -> Result<Vec<TokenId>>

Get all possible tokens for a prefix

Source

fn can_extend(&self, tokens: &[TokenId], next_token: TokenId) -> bool

Check if sequence can be extended with token

Source

fn token_type(&self, token_id: TokenId) -> TokenType

Get token type (word, subword, punctuation, etc.)

Source

fn normalize_text(&self, text: &str) -> String

Normalize text before tokenization

Source

fn pre_tokenize(&self, text: &str) -> Vec<String>

Pre-tokenize text (split into words/subwords)

Implementors§