Skip to main content

TokenizerCapabilities

Trait TokenizerCapabilities 

Source
/// Advanced tokenizer capabilities, layered on top of the base `Tokenizer` trait.
///
/// NOTE(review): `TokenId`, `TokenType`, and the `Result` alias are declared
/// elsewhere in this crate — presumably `Result` is a crate-local alias with a
/// fixed error type; confirm against the crate root.
pub trait TokenizerCapabilities: Tokenizer {
    // Required methods
    /// Get token probability/likelihood for the given text; `None` when the
    /// tokenizer cannot provide a probability for this token.
    fn token_probability(&self, text: &str, token_id: TokenId) -> Option<f32>;
    /// Get all possible tokens for a prefix.
    fn get_prefix_tokens(&self, prefix: &str) -> Result<Vec<TokenId>>;
    /// Check whether the token sequence can be extended with `next_token`.
    fn can_extend(&self, tokens: &[TokenId], next_token: TokenId) -> bool;
    /// Get the token's type (word, subword, punctuation, etc.).
    fn token_type(&self, token_id: TokenId) -> TokenType;
    /// Normalize text before tokenization.
    fn normalize_text(&self, text: &str) -> String;
    /// Pre-tokenize text (split into words/subwords) ahead of full tokenization.
    fn pre_tokenize(&self, text: &str) -> Vec<String>;
}
Expand description

Advanced tokenizer capabilities

Required Methods§

Source

fn token_probability(&self, text: &str, token_id: TokenId) -> Option<f32>

Get token probability/likelihood for text

Source

fn get_prefix_tokens(&self, prefix: &str) -> Result<Vec<TokenId>>

Get all possible tokens for a prefix

Source

fn can_extend(&self, tokens: &[TokenId], next_token: TokenId) -> bool

Check if sequence can be extended with token

Source

fn token_type(&self, token_id: TokenId) -> TokenType

Get token type (word, subword, punctuation, etc.)

Source

fn normalize_text(&self, text: &str) -> String

Normalize text before tokenization

Source

fn pre_tokenize(&self, text: &str) -> Vec<String>

Pre-tokenize text (split into words/subwords)

Implementors§