use std::collections::HashMap;
/// Common interface for types that turn text into a sequence of `u32` token ids.
///
/// Implementors must be both [`Send`] and [`Sync`], so a tokenizer can be
/// moved to, and shared between, threads.
pub trait SubwordTokenizer: Send + Sync {
    /// Tokenizes `text` and returns the resulting ids as a `Vec<u32>`.
    fn tokenize(&self, text: &str) -> Vec<u32>;
}
/// Tokenizer backed by a plain word-to-id vocabulary table.
///
/// Deprecated since 0.1.5: despite the name, this type performs a vocabulary
/// lookup rather than BPE. Refer to it as [`VocabTokenizer`] instead.
#[deprecated(
    since = "0.1.5",
    note = "misleading name: this is a vocabulary lookup, not BPE. Use VocabTokenizer instead."
)]
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct BpeTokenizer {
    /// Maps each known word to its `u32` id.
    vocab: HashMap<String, u32>,
}
#[allow(deprecated)]
impl BpeTokenizer {
pub fn from_vocab(vocab: HashMap<String, u32>) -> Self {
Self { vocab }
}
}
// This impl has to name the deprecated type itself, so the lint is silenced here.
#[allow(deprecated)]
impl SubwordTokenizer for BpeTokenizer {
    /// Looks up every whitespace-separated word of `text` in the vocabulary.
    ///
    /// Ids are returned in the order their words appear. Words that are not
    /// present in the vocabulary are skipped without any placeholder, so the
    /// output may be shorter than the number of words in `text`.
    fn tokenize(&self, text: &str) -> Vec<u32> {
        let mut ids = Vec::new();
        for word in text.split_whitespace() {
            // Unknown words contribute nothing to the output.
            if let Some(&id) = self.vocab.get(word) {
                ids.push(id);
            }
        }
        ids
    }
}
/// Alias for [`BpeTokenizer`] under the name its deprecation note recommends.
// The alias target carries `#[deprecated]`, so the lint is silenced for this
// definition.
#[allow(deprecated)]
pub type VocabTokenizer = BpeTokenizer;