entrenar/tokenizer/traits.rs
//! Definition of the [`Tokenizer`] trait and the [`TokenId`] alias used in its signatures.
2
3use super::error::Result;
4
/// Numeric identifier of a single token.
///
/// Aliased to `u32`. This is the ID type that appears in the [`Tokenizer`]
/// method signatures (`encode`, `decode`, `id_to_token`, `token_to_id`).
pub type TokenId = u32;
7
8/// Tokenizer trait
9pub trait Tokenizer: Send + Sync {
10 /// Train the tokenizer on a corpus
11 fn train(&mut self, corpus: &[&str]) -> Result<()>;
12
13 /// Encode text to token IDs
14 fn encode(&self, text: &str) -> Result<Vec<TokenId>>;
15
16 /// Decode token IDs to text
17 fn decode(&self, ids: &[TokenId]) -> Result<String>;
18
19 /// Get vocabulary size
20 fn vocab_size(&self) -> usize;
21
22 /// Check if tokenizer is trained
23 fn is_trained(&self) -> bool;
24
25 /// Get token for ID
26 fn id_to_token(&self, id: TokenId) -> Option<&str>;
27
28 /// Get ID for token
29 fn token_to_id(&self, token: &str) -> Option<TokenId>;
30}