use crate::utils::corpus_tracker::CorpusTracker;
use crate::tokenizers::benchmark::dev::traits::TokenizerTrainingMetrics;
/// Error type used in the `Result`s returned by the fallible [`Tokenizer`]
/// methods (`train`, `save`, `load`); an alias for `String`.
pub type TokenizerError = String;
/// Integer-like type that a [`Tokenizer`] can use as its token id.
///
/// Implementors must be `Copy`, comparable with `Eq`, hashable, and free of
/// borrowed data (`'static`).
pub trait TokenIdType: Copy + Eq + std::hash::Hash + 'static {
    /// Builds a token id from a `usize` value `n`.
    fn from_usize(n: usize) -> Self;
}
impl TokenIdType for u32 {
    /// Converts a `usize` index into a `u32` token id.
    ///
    /// # Panics
    ///
    /// Panics if `n` is larger than `u32::MAX`. A bare `as` cast would
    /// silently wrap on 64-bit targets and produce an id that aliases a
    /// different, smaller one.
    fn from_usize(n: usize) -> Self {
        assert!(
            n <= u32::MAX as usize,
            "token id {} does not fit in u32",
            n
        );
        // Lossless: the range check above guarantees `n` fits.
        n as u32
    }
}
impl TokenIdType for u64 {
    /// Converts a `usize` index into a `u64` token id with a numeric cast.
    fn from_usize(n: usize) -> Self {
        // NOTE(review): lossless as long as `usize` is at most 64 bits wide,
        // which holds on the pointer widths Rust currently targets.
        n as Self
    }
}
impl TokenIdType for usize {
    /// Identity conversion: a `usize` token id is the input value itself.
    fn from_usize(n: usize) -> Self {
        n
    }
}
/// Common interface implemented by every tokenizer in this crate.
///
/// An implementor exposes a [`CorpusTracker`], can be trained from a data
/// path, converts between text and token ids, and can be saved to / loaded
/// from a path.
pub trait Tokenizer {
    /// Type of the ids produced by [`Tokenizer::encode`] and consumed by
    /// [`Tokenizer::decode`].
    type TokenId: TokenIdType;

    /// Returns a static name for this tokenizer.
    fn name(&self) -> &'static str;

    /// Returns a shared reference to this tokenizer's [`CorpusTracker`].
    fn corpus(&self) -> &CorpusTracker;

    /// Returns a mutable reference to this tokenizer's [`CorpusTracker`].
    fn corpus_mut(&mut self) -> &mut CorpusTracker;

    /// Returns the vocabulary size as a `usize`.
    fn vocab_size(&self) -> usize;

    /// Trains the tokenizer.
    ///
    /// `data_path` and `corpus_name` are passed as string slices; presumably
    /// the training data location and a label for the corpus — confirm
    /// against implementors.
    ///
    /// # Errors
    ///
    /// Returns a [`TokenizerError`] when training fails.
    fn train(
        &mut self,
        data_path: &str,
        corpus_name: &str,
    ) -> Result<TokenizerTrainingMetrics, TokenizerError>;

    /// Encodes `text` into a vector of token ids.
    fn encode(&self, text: &str) -> Vec<Self::TokenId>;

    /// Decodes a slice of token ids into a `String`.
    fn decode(&self, tokens: &[Self::TokenId]) -> String;

    /// Saves the tokenizer using `path`.
    ///
    /// # Errors
    ///
    /// Returns a [`TokenizerError`] when saving fails.
    fn save(&self, path: &str) -> Result<(), TokenizerError>;

    /// Loads tokenizer state using `path`, mutating `self`.
    ///
    /// # Errors
    ///
    /// Returns a [`TokenizerError`] when loading fails.
    fn load(&mut self, path: &str) -> Result<(), TokenizerError>;

    /// Returns the vocabulary tokens as owned strings.
    fn vocab_tokens(&self) -> Vec<String>;

    /// Returns a size for this tokenizer as a `usize`.
    ///
    /// NOTE(review): how this differs from [`Tokenizer::vocab_size`] is not
    /// visible from the trait definition — confirm with implementors.
    fn size(&self) -> usize;
}