use crate::{
TokenType,
WCResult,
alloc::{
sync::Arc,
vec::Vec,
},
spanners::TextSpanner,
vocab::{
SpecialFilter,
SpecialVocab,
},
};
/// Encoder interface that turns text into tokens of type `T`.
///
/// Implementors provide [`spanner`](Self::spanner),
/// [`special_vocab`](Self::special_vocab) and
/// [`try_encode_append`](Self::try_encode_append); the remaining methods have
/// default implementations built on top of those three.
pub trait TokenEncoder<T: TokenType>: Send + Sync {
    /// Returns the shared [`TextSpanner`] held by this encoder.
    fn spanner(&self) -> &Arc<dyn TextSpanner>;

    /// Returns the [`SpecialVocab`] held by this encoder.
    fn special_vocab(&self) -> &SpecialVocab<T>;

    /// Estimated number of input bytes per produced token.
    ///
    /// The default forwards to the spanner's `expected_bytes_per_span()`.
    fn expected_bytes_per_token(&self) -> f32 {
        self.spanner().expected_bytes_per_span()
    }

    /// Estimated number of tokens that `text` encodes to.
    ///
    /// Computed as `text.len() / expected_bytes_per_token()` in `f32` and
    /// truncated to `usize`. Because float-to-integer `as` casts saturate, a
    /// ratio of zero yields `usize::MAX` for non-empty text, while a NaN or
    /// negative quotient yields `0`. Callers must treat the result as a rough
    /// hint and guard any arithmetic performed on it.
    fn expected_token_count(
        &self,
        text: &str,
    ) -> usize {
        (text.len() as f32 / self.expected_bytes_per_token()) as usize
    }

    /// Encodes `text` and appends the resulting tokens to `tokens`.
    ///
    /// `special_filter` is handed to the implementation unchanged.
    /// NOTE(review): presumably it restricts which special tokens are
    /// recognised, with `None` meaning "no filtering" — confirm against
    /// `SpecialFilter`.
    ///
    /// # Errors
    ///
    /// Implementation-defined; see the concrete encoder.
    fn try_encode_append(
        &self,
        text: &str,
        tokens: &mut Vec<T>,
        special_filter: Option<&SpecialFilter>,
    ) -> WCResult<()>;

    /// Encodes `text` into a freshly allocated token vector.
    ///
    /// The vector is pre-sized from
    /// [`expected_token_count`](Self::expected_token_count) plus 15% headroom.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by
    /// [`try_encode_append`](Self::try_encode_append).
    fn try_encode(
        &self,
        text: &str,
        special_filter: Option<&SpecialFilter>,
    ) -> WCResult<Vec<T>> {
        // The estimate can saturate to `usize::MAX` (e.g. a zero
        // bytes-per-token ratio), so the headroom multiply must not overflow
        // and the hint must not reach `Vec::with_capacity` unbounded. Clamping
        // to one slot per input byte keeps the allocation proportional to the
        // input; the vector still grows on demand if the hint is too small.
        let capacity = self
            .expected_token_count(text)
            .saturating_mul(115)
            / 100;
        let capacity = capacity.min(text.len());

        let mut tokens = Vec::with_capacity(capacity);
        self.try_encode_append(text, &mut tokens, special_filter)?;
        Ok(tokens)
    }

    /// Encodes every string in `batch`, returning one token vector per input
    /// in the same order.
    ///
    /// # Errors
    ///
    /// Stops at the first input whose [`try_encode`](Self::try_encode) fails
    /// and returns that error; results for earlier inputs are discarded.
    fn try_encode_batch(
        &self,
        batch: &[&str],
        special_filter: Option<&SpecialFilter>,
    ) -> WCResult<Vec<Vec<T>>> {
        batch
            .iter()
            .map(|text| self.try_encode(text, special_filter))
            .collect()
    }
}