pub trait TokenizerCapabilities: Tokenizer {
// Required methods
fn token_probability(&self, text: &str, token_id: TokenId) -> Option<f32>;
fn get_prefix_tokens(&self, prefix: &str) -> Result<Vec<TokenId>>;
fn can_extend(&self, tokens: &[TokenId], next_token: TokenId) -> bool;
fn token_type(&self, token_id: TokenId) -> TokenType;
fn normalize_text(&self, text: &str) -> String;
fn pre_tokenize(&self, text: &str) -> Vec<String>;
}Expand description
Advanced tokenizer capabilities
Required Methods
fn token_probability(&self, text: &str, token_id: TokenId) -> Option<f32>
Get token probability/likelihood for text
fn get_prefix_tokens(&self, prefix: &str) -> Result<Vec<TokenId>>
Get all possible tokens for a prefix
fn can_extend(&self, tokens: &[TokenId], next_token: TokenId) -> bool
Check if sequence can be extended with token
fn token_type(&self, token_id: TokenId) -> TokenType
Get token type (word, subword, punctuation, etc.)
fn normalize_text(&self, text: &str) -> String
Normalize text before tokenization
fn pre_tokenize(&self, text: &str) -> Vec<String>
Pre-tokenize text (split into words/subwords)