use super::pieces::PiecesWithIds;
use crate::{error::BoxedError, repository::repo::Repo};
/// Input accepted by [`Tokenizer::encode`].
///
/// The payload type `I` is any string-like value (`I: AsRef<str>`), so both
/// owned and borrowed strings can be carried without conversion.
///
/// The standard traits are derived so values can be logged, duplicated and
/// compared (e.g. in tests) whenever the payload type supports it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenizerEncodeInput<I>
where
    I: AsRef<str>,
{
    /// Text supplied as a raw string.
    RawString(I),
}
impl From<String> for TokenizerEncodeInput<String> {
fn from(s: String) -> Self {
TokenizerEncodeInput::RawString(s)
}
}
impl From<&str> for TokenizerEncodeInput<String> {
fn from(s: &str) -> Self {
TokenizerEncodeInput::RawString(s.to_owned())
}
}
/// Interface for tokenizers that convert between text and `u32` piece identifiers.
pub trait Tokenizer {
    /// Encodes a batch of inputs into pieces together with their identifiers.
    ///
    /// `input` may be any value viewable as a slice of
    /// [`TokenizerEncodeInput`] (for example a `Vec` or an array).
    ///
    /// # Errors
    ///
    /// Returns a [`BoxedError`] when encoding fails; the failure conditions
    /// are defined by the implementor.
    fn encode<Batch, Text>(&self, input: Batch) -> Result<PiecesWithIds, BoxedError>
    where
        Text: AsRef<str>,
        Batch: AsRef<[TokenizerEncodeInput<Text>]>;

    /// Decodes a batch of piece-identifier sequences into strings.
    ///
    /// `input` may be any value viewable as a slice of `u32` sequences.
    /// Presumably one string is produced per input sequence, and
    /// `skip_special_pieces` asks the implementor to leave special pieces out
    /// of the decoded text; confirm against the concrete implementations.
    ///
    /// # Errors
    ///
    /// Returns a [`BoxedError`] when decoding fails; the failure conditions
    /// are defined by the implementor.
    fn decode<Batch, Ids>(
        &self,
        input: Batch,
        skip_special_pieces: bool,
    ) -> Result<Vec<String>, BoxedError>
    where
        Ids: AsRef<[u32]>,
        Batch: AsRef<[Ids]>;

    /// Looks up the identifier of `piece`.
    ///
    /// Returns `None` when the tokenizer has no identifier for it
    /// (presumably because the piece is not in the vocabulary).
    fn piece_to_id(&self, piece: impl AsRef<str>) -> Option<u32>;

    /// Returns the tokenizer's EOS piece (presumably the end-of-sequence
    /// marker), or `None` if the tokenizer does not have one.
    fn eos_piece(&self) -> Option<&str>;
}
/// Constructor trait for [`Tokenizer`] implementations that can be built from
/// a [`Repo`].
pub trait FromRepo: Sized + Tokenizer {
    /// Builds the tokenizer using `repo` as its source.
    ///
    /// # Errors
    ///
    /// Returns a [`BoxedError`] when construction fails; the failure
    /// conditions are defined by the implementor.
    fn from_repo(repo: &impl Repo) -> Result<Self, BoxedError>;
}