Skip to main content

Tokenizer

Trait Tokenizer 

Source
pub trait Tokenizer: Send + Sync {
    // Required methods
    fn train(&mut self, corpus: &[&str]) -> Result<()>;
    fn encode(&self, text: &str) -> Result<Vec<TokenId>>;
    fn decode(&self, ids: &[TokenId]) -> Result<String>;
    fn vocab_size(&self) -> usize;
    fn is_trained(&self) -> bool;
    fn id_to_token(&self, id: TokenId) -> Option<&str>;
    fn token_to_id(&self, token: &str) -> Option<TokenId>;
}
Expand description

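Common interface for tokenizers: training on a corpus, encoding text to token IDs, decoding token IDs back to text, and looking up vocabulary entries. Implementors must be `Send + Sync`.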

Required Methods§

Source

fn train(&mut self, corpus: &[&str]) -> Result<()>

Train the tokenizer on a corpus

Source

fn encode(&self, text: &str) -> Result<Vec<TokenId>>

Encode text to token IDs

Source

fn decode(&self, ids: &[TokenId]) -> Result<String>

Decode token IDs to text

Source

fn vocab_size(&self) -> usize

Get vocabulary size

Source

fn is_trained(&self) -> bool

Check whether the tokenizer has been trained

Source

fn id_to_token(&self, id: TokenId) -> Option<&str>

Get the token string for the given ID, if there is one

Source

fn token_to_id(&self, token: &str) -> Option<TokenId>

Get the ID for the given token string, if there is one

Dyn Compatibility§

This trait is dyn compatible.

In older versions of Rust, dyn compatibility was called "object safety".

Implementors§