Skip to main content

entrenar/tokenizer/
traits.rs

//! Tokenizer trait definition.
2
3use super::error::Result;
4
/// Integer identifier for a single token.
///
/// Alias for `u32`. The [`Tokenizer`] trait uses it as the element type of
/// encoded sequences and as the key for token lookups.
pub type TokenId = u32;
7
8/// Tokenizer trait
9pub trait Tokenizer: Send + Sync {
10    /// Train the tokenizer on a corpus
11    fn train(&mut self, corpus: &[&str]) -> Result<()>;
12
13    /// Encode text to token IDs
14    fn encode(&self, text: &str) -> Result<Vec<TokenId>>;
15
16    /// Decode token IDs to text
17    fn decode(&self, ids: &[TokenId]) -> Result<String>;
18
19    /// Get vocabulary size
20    fn vocab_size(&self) -> usize;
21
22    /// Check if tokenizer is trained
23    fn is_trained(&self) -> bool;
24
25    /// Get token for ID
26    fn id_to_token(&self, id: TokenId) -> Option<&str>;
27
28    /// Get ID for token
29    fn token_to_id(&self, token: &str) -> Option<TokenId>;
30}