//! `oxidized_transformers/tokenizers/tokenizer.rs` — tokenizer traits and
//! the input types used for encoding.
use super::pieces::PiecesWithIds;
use crate::{error::BoxedError, repository::repo::Repo};
/// Input for encoding with a tokenizer.
/// Input for encoding with a tokenizer.
///
/// Generic over any string-like payload `I` so callers can pass
/// `String`, `&str`, or other `AsRef<str>` types without copying.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenizerEncodeInput<I>
where
    I: AsRef<str>,
{
    /// A raw, unprocessed text sequence.
    RawString(I),
    // TODO: add chunked input
}

impl From<String> for TokenizerEncodeInput<String> {
    /// Wrap an owned string as raw-string input.
    fn from(s: String) -> Self {
        Self::RawString(s)
    }
}

impl From<&str> for TokenizerEncodeInput<String> {
    /// Wrap a borrowed string as raw-string input, taking ownership
    /// of a copy.
    fn from(s: &str) -> Self {
        Self::RawString(s.to_owned())
    }
}
/// Trait implemented by all tokenizers.
/// Trait implemented by all tokenizers.
///
/// NOTE(review): `encode` and `decode` are generic methods, so this trait
/// is not object-safe (`dyn Tokenizer` cannot be used) — confirm that is
/// intentional before adding trait-object call sites.
pub trait Tokenizer {
    /// Split one or more texts into pieces.
    ///
    /// * input - Sequences to tokenize. If the sequences are
    ///   strings, they are automatically converted to chunks.
    ///
    /// Returns: Pieces in each sequence.
    ///
    /// # Errors
    ///
    /// Returns a [`BoxedError`] when tokenization fails.
    fn encode<V, I>(&self, input: V) -> Result<PiecesWithIds, BoxedError>
    where
        V: AsRef<[TokenizerEncodeInput<I>]>,
        I: AsRef<str>;

    /// Reconstruct string sequences from piece identifiers.
    ///
    /// * input - The piece identifiers to reconstruct the strings from.
    /// * skip_special_pieces - Skip special pieces during decoding.
    ///
    /// Returns: The decoded strings, one per input sequence.
    ///
    /// # Errors
    ///
    /// Returns a [`BoxedError`] when decoding fails.
    fn decode<V, I>(&self, input: V, skip_special_pieces: bool) -> Result<Vec<String>, BoxedError>
    where
        V: AsRef<[I]>,
        I: AsRef<[u32]>;

    /// Get the ID for a single piece.
    ///
    /// * piece - The piece to look up the identifier for.
    ///
    /// Returns: The piece identifier, `None` when the piece
    /// is unknown.
    fn piece_to_id(&self, piece: impl AsRef<str>) -> Option<u32>;

    /// Get the end-of-sequence piece.
    ///
    /// Returns: The end-of-sequence piece or
    /// `None` when this piece is not defined.
    fn eos_piece(&self) -> Option<&str>;
}
/// Trait implemented by tokenizers that can be loaded from a repository.
/// Trait implemented by tokenizers that can be loaded from a repository.
pub trait FromRepo
where
    Self: Sized + Tokenizer,
{
    /// Load a tokenizer from a repository.
    ///
    /// * repo - The repository to load the tokenizer from.
    ///
    /// Returns: The tokenizer loaded from the repository.
    ///
    /// # Errors
    ///
    /// Returns a [`BoxedError`] when the tokenizer cannot be loaded
    /// from the repository.
    fn from_repo(repo: &impl Repo) -> Result<Self, BoxedError>;
}