//! oxidized_transformers/tokenizers/tokenizer.rs
//!
//! Core tokenizer abstractions: the encode input type, the [`Tokenizer`]
//! trait, and repository-based construction via [`FromRepo`].
use super::pieces::PiecesWithIds;
use crate::{error::BoxedError, repository::repo::Repo};

/// Input for encoding with a tokenizer.
///
/// Currently only raw (untokenized) strings are supported; chunked
/// input is planned but not yet implemented (see the TODO below).
pub enum TokenizerEncodeInput<I: AsRef<str>> {
    RawString(I),
    // TODO: add chunked input
}

impl From<String> for TokenizerEncodeInput<String> {
    fn from(s: String) -> Self {
        TokenizerEncodeInput::RawString(s)
    }
}

impl From<&str> for TokenizerEncodeInput<String> {
    fn from(s: &str) -> Self {
        TokenizerEncodeInput::RawString(s.to_owned())
    }
}

/// Trait implemented by all tokenizers.
///
/// A tokenizer converts text into piece identifiers ([`Tokenizer::encode`])
/// and reconstructs text from piece identifiers ([`Tokenizer::decode`]).
pub trait Tokenizer {
    /// Split one or more texts into pieces.
    ///
    /// * `input` - Sequences to tokenize. If the sequences are
    ///   strings, they are automatically converted to chunks
    ///   (via the `From` impls on [`TokenizerEncodeInput`]).
    ///
    /// Returns: Pieces in each sequence.
    ///
    /// # Errors
    ///
    /// Returns a boxed error when an implementation fails to encode
    /// any of the input sequences.
    fn encode<V, I>(&self, input: V) -> Result<PiecesWithIds, BoxedError>
    where
        V: AsRef<[TokenizerEncodeInput<I>]>,
        I: AsRef<str>;

    /// Reconstruct string sequences from piece identifiers.
    ///
    /// * `input` - The piece identifiers to reconstruct the strings from,
    ///   one slice of `u32` identifiers per output string.
    /// * `skip_special_pieces` - Skip special pieces during decoding.
    ///
    /// Returns: The decoded strings, in the same order as `input`.
    ///
    /// # Errors
    ///
    /// Returns a boxed error when an implementation fails to decode
    /// any of the identifier sequences.
    fn decode<V, I>(&self, input: V, skip_special_pieces: bool) -> Result<Vec<String>, BoxedError>
    where
        V: AsRef<[I]>,
        I: AsRef<[u32]>;

    /// Get the ID for a single piece.
    ///
    /// * `piece` - The piece to look up the identifier for.
    ///
    /// Returns: The piece identifier, `None` when the piece
    /// is unknown.
    fn piece_to_id(&self, piece: impl AsRef<str>) -> Option<u32>;

    /// Get the end-of-sequence piece.
    ///
    /// Returns: The end-of-sequence piece or
    /// `None` when this piece is not defined.
    fn eos_piece(&self) -> Option<&str>;
}

/// Trait implemented by tokenizers that can be loaded from a repository.
///
/// `Sized` and [`Tokenizer`] are supertraits: only concrete tokenizer
/// types can be constructed this way.
pub trait FromRepo: Sized + Tokenizer {
    /// Load a tokenizer from a repository.
    ///
    /// * `repo` - The repository to load the tokenizer from.
    ///
    /// Returns: The tokenizer loaded from the repository, or a boxed
    /// error when loading fails.
    fn from_repo(repo: &impl Repo) -> Result<Self, BoxedError>;
}