use crate::error::DatasetResult;
/// Common interface for text tokenizers.
///
/// Implementors provide [`encode`](Tokenizer::encode), [`decode`](Tokenizer::decode)
/// and [`vocab_size`](Tokenizer::vocab_size); every other method has a default
/// implementation layered on top of those. The `Send + Sync` supertraits mean
/// each implementor must be usable from multiple threads.
pub trait Tokenizer: Send + Sync {
    /// Encodes `text` into a sequence of token ids.
    ///
    /// # Errors
    ///
    /// Failure conditions are defined by the implementor.
    fn encode(&self, text: &str) -> DatasetResult<Vec<u32>>;

    /// Decodes a sequence of token ids back into a string.
    ///
    /// # Errors
    ///
    /// Failure conditions are defined by the implementor.
    fn decode(&self, ids: &[u32]) -> DatasetResult<String>;

    /// Returns the vocabulary size reported by the implementor.
    fn vocab_size(&self) -> usize;

    /// Returns how many token ids [`encode`](Tokenizer::encode) produces for `text`.
    ///
    /// The default implementation performs a full encode and takes the length
    /// of the result.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`encode`](Tokenizer::encode).
    fn count_tokens(&self, text: &str) -> DatasetResult<usize> {
        self.encode(text).map(|ids| ids.len())
    }

    /// Encodes each entry of `texts` in order, one output vector per input.
    ///
    /// # Errors
    ///
    /// Stops at the first entry that fails to encode and returns that error;
    /// later entries are not encoded.
    fn encode_batch(&self, texts: &[&str]) -> DatasetResult<Vec<Vec<u32>>> {
        let mut encoded = Vec::with_capacity(texts.len());
        for text in texts {
            encoded.push(self.encode(text)?);
        }
        Ok(encoded)
    }

    /// Decodes each id sequence of `ids_batch` in order, one string per input.
    ///
    /// # Errors
    ///
    /// Stops at the first sequence that fails to decode and returns that
    /// error; later sequences are not decoded.
    fn decode_batch(&self, ids_batch: &[&[u32]]) -> DatasetResult<Vec<String>> {
        let mut decoded = Vec::with_capacity(ids_batch.len());
        for ids in ids_batch {
            decoded.push(self.decode(ids)?);
        }
        Ok(decoded)
    }

    /// Returns the tokenizer's special tokens as `(String, u32)` pairs
    /// (presumably the token text and its id; confirm against implementors).
    ///
    /// The default implementation reports none and returns an empty vector.
    fn special_tokens(&self) -> Vec<(String, u32)> {
        vec![]
    }
}
// Backend submodules. Each one is compiled only when its Cargo feature is
// enabled, so a build without that feature does not include the module.
#[cfg(feature = "hf-tokenizer")]
pub mod hf;
#[cfg(feature = "tiktoken")]
pub mod tiktoken;
// Re-export the backend types under the same feature gates as their modules,
// so they can be named directly from this module when the feature is on.
#[cfg(feature = "hf-tokenizer")]
pub use hf::HfTokenizer;
#[cfg(feature = "tiktoken")]
pub use tiktoken::TiktokenTokenizer;