pub trait Tokenizer: Send + Sync {
// Required methods
fn encode(
&self,
text: &str,
add_special: bool,
) -> Result<Vec<TokenId>, FerrumError>;
fn decode(
&self,
tokens: &[TokenId],
skip_special: bool,
) -> Result<String, FerrumError>;
fn decode_incremental(
&self,
prev: &[TokenId],
next: TokenId,
) -> Result<String, FerrumError>;
fn vocab_size(&self) -> usize;
fn special_tokens(&self) -> &SpecialTokens;
fn token_id(&self, text: &str) -> Option<TokenId>;
fn token_text(&self, token_id: TokenId) -> Option<&str>;
fn info(&self) -> TokenizerInfo;
// Provided methods
fn is_special_token(&self, token_id: TokenId) -> bool { ... }
fn apply_chat_template(
&self,
messages: &[ChatMessage],
) -> Result<String, FerrumError> { ... }
}
Expand description
Core tokenizer trait for encoding/decoding operations
Required Methods
Source
fn encode(
&self,
text: &str,
add_special: bool,
) -> Result<Vec<TokenId>, FerrumError>
fn encode( &self, text: &str, add_special: bool, ) -> Result<Vec<TokenId>, FerrumError>
Encode text to token IDs
Source
fn decode(
&self,
tokens: &[TokenId],
skip_special: bool,
) -> Result<String, FerrumError>
fn decode( &self, tokens: &[TokenId], skip_special: bool, ) -> Result<String, FerrumError>
Decode token IDs to text
Source
fn decode_incremental(
&self,
prev: &[TokenId],
next: TokenId,
) -> Result<String, FerrumError>
fn decode_incremental( &self, prev: &[TokenId], next: TokenId, ) -> Result<String, FerrumError>
Incremental decode: given the previous tokens and a new token, return only the new text. This is crucial for streaming applications to avoid re-decoding all tokens.
Source
fn vocab_size(&self) -> usize
fn vocab_size(&self) -> usize
Get vocabulary size
Source
fn special_tokens(&self) -> &SpecialTokens
fn special_tokens(&self) -> &SpecialTokens
Get special tokens configuration
Source
fn token_id(&self, text: &str) -> Option<TokenId>
fn token_id(&self, text: &str) -> Option<TokenId>
Get token ID for a specific text (if exists in vocabulary)
Source
fn token_text(&self, token_id: TokenId) -> Option<&str>
fn token_text(&self, token_id: TokenId) -> Option<&str>
Get text for a specific token ID
Source
fn info(&self) -> TokenizerInfo
fn info(&self) -> TokenizerInfo
Get tokenizer information
Provided Methods
Source
fn is_special_token(&self, token_id: TokenId) -> bool
fn is_special_token(&self, token_id: TokenId) -> bool
Check if token is a special token
Source
fn apply_chat_template(
&self,
messages: &[ChatMessage],
) -> Result<String, FerrumError>
fn apply_chat_template( &self, messages: &[ChatMessage], ) -> Result<String, FerrumError>
Apply chat template if supported