/// Tokenizer trait.
///
/// Every implementor must also be `Send + Sync` (supertrait bounds on the
/// declaration). All seven methods are required; no default implementations
/// are provided here.
///
/// NOTE(review): `Result` and `TokenId` are declared elsewhere; the error
/// type carried by `Result` is not visible from this declaration.
pub trait Tokenizer: Send + Sync {
// Required methods
/// Takes the tokenizer by `&mut self` together with a slice of string
/// slices and returns `Result<()>`.
///
/// Presumably fits the tokenizer to `corpus` (and flips `is_trained`) --
/// TODO confirm against the implementations.
fn train(&mut self, corpus: &[&str]) -> Result<()>;
/// Maps `text` to a vector of `TokenId`s, or an error.
///
/// NOTE(review): the failure conditions (e.g. calling this before `train`)
/// are not visible here -- confirm against the implementations.
fn encode(&self, text: &str) -> Result<Vec<TokenId>>;
/// Maps a slice of `TokenId`s to an owned `String`, or an error.
///
/// Presumably the inverse of `encode` -- TODO confirm whether the round
/// trip is exact.
fn decode(&self, ids: &[TokenId]) -> Result<String>;
/// Get vocabulary size.
fn vocab_size(&self) -> usize;
/// Check if tokenizer is trained.
fn is_trained(&self) -> bool;
/// Get token for ID.
///
/// The returned `&str` borrows from `self`. `None` presumably means the
/// ID is not in the vocabulary -- TODO confirm.
fn id_to_token(&self, id: TokenId) -> Option<&str>;
/// Get ID for token.
///
/// `None` presumably means the token is not in the vocabulary -- TODO
/// confirm.
fn token_to_id(&self, token: &str) -> Option<TokenId>;
}Expand description
Tokenizer trait
Required Methods
fn vocab_size(&self) -> usize
Returns the size of the vocabulary.
fn is_trained(&self) -> bool
Checks whether the tokenizer has been trained.
fn id_to_token(&self, id: TokenId) -> Option<&str>
Returns the token for the given ID.
fn token_to_id(&self, token: &str) -> Option<TokenId>
Returns the ID for the given token.