use alloc::borrow::Cow;
use alloc::vec::Vec;
use core::borrow::Borrow;
use core::fmt::Debug;
use core::ops::Deref;
use crate::{Model, TokenId};
mod bytepair;
mod unigram;
mod wordpiece;
pub(crate) use bytepair::*;
pub(crate) use unigram::*;
pub(crate) use wordpiece::*;
/// Error type returned by `Encoder::encode` when encoding fails.
///
/// The enum is `#[non_exhaustive]`, so code matching on it from outside this
/// crate must include a wildcard arm.
#[non_exhaustive]
#[derive(Debug, thiserror::Error)]
pub enum EncodeError {
/// Reported for an invalid piece; displayed as `invalid piece ` followed by
/// the `Debug` rendering of the carried bytes.
///
/// NOTE(review): the payload is presumably the raw bytes of the offending
/// piece — confirm against the encoder implementations.
#[error("invalid piece {0:?}")]
InvalidPiece(Vec<u8>),
}
/// A piece of text paired with a token id.
///
/// The UTF-8 bytes of `text` are what this type's `Borrow<[u8]>` and `Deref`
/// implementations expose.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct TextPart<'a> {
/// The text of this part, either borrowed for `'a` or owned.
pub text: Cow<'a, str>,
/// Token id attached to this part.
///
/// NOTE(review): presumably identifies a special token when the part is one;
/// how a non-special part is represented is not visible here — confirm.
pub special: TokenId,
}
/// Lets a `TextPart` be borrowed as the UTF-8 bytes of its `text` field.
///
/// NOTE(review): `TextPart` derives `PartialEq`/`Eq` over both `text` and
/// `special`, while this borrowed form only covers `text`. The `Borrow`
/// contract expects equality to agree between the owned and borrowed forms;
/// confirm this impl is not relied on for keyed-collection lookups.
impl Borrow<[u8]> for TextPart<'_> {
    #[inline(always)]
    fn borrow(&self) -> &[u8] {
        // Deref-coerce the `Cow<str>` to `&str` first, then view its bytes.
        let text: &str = &self.text;
        text.as_bytes()
    }
}
/// Dereferences a `TextPart` to the UTF-8 bytes of its `text` field, so
/// byte-slice methods can be called on it directly.
impl Deref for TextPart<'_> {
    type Target = [u8];

    #[inline(always)]
    fn deref(&self) -> &[u8] {
        // `&self.text` (`&Cow<str>`) coerces to `&str` at the call site.
        str::as_bytes(&self.text)
    }
}
/// Crate-internal interface for encoders that turn text into token ids.
///
/// Implementors must be `Debug`, `Send`, `Sync` and `'static`.
pub(crate) trait Encoder: Debug + Send + Sync + 'static {
    /// Encodes the input into a sequence of token ids.
    ///
    /// Takes the input `text` together with a mutable slice of `parts`, and
    /// returns the resulting ids or an [`EncodeError`].
    ///
    /// NOTE(review): how `parts` relates to `text`, and whether
    /// implementations modify `parts`, is not visible from this declaration —
    /// confirm against the concrete encoders.
    fn encode(
        &self,
        text: &str,
        parts: &mut [TextPart<'_>],
    ) -> Result<Vec<TokenId>, EncodeError>;

    /// Returns the [`Model`] value for this encoder.
    fn model(&self) -> Model;
}