pub trait Tokenize: Send + Sync {
// Required methods
fn encode(&self, text: &str) -> Vec<u32>;
fn decode(&self, ids: &[u32]) -> Result<String, TokenizeError>;
fn vocab_size(&self) -> usize;
}Expand description
Common interface for all tokenizer backends.
Implemented by Tokenizer (BPE),
SentencePieceTokenizer (unigram), and
WordPieceTokenizer (WordPiece).
Required Methods
fn encode(&self, text: &str) -> Vec<u32>
Encode text into a sequence of token IDs.
fn decode(&self, ids: &[u32]) -> Result<String, TokenizeError>
Decode token IDs back to text.
Returns an error if any token ID is invalid.
fn vocab_size(&self) -> usize
Return the vocabulary size (number of distinct tokens).