pub trait TokenizerEnv: Send {
// Required methods
fn tok_trie(&self) -> &TokTrie;
fn tokenize_bytes(&self, s: &[u8]) -> Vec<TokenId>;
// Provided methods
fn tokenize_bytes_marker(&self, s: &[u8]) -> (Vec<TokenId>, usize) { ... }
fn tokenize(&self, s: &str) -> Vec<TokenId> { ... }
fn tokenize_special(&self, s: &str) -> Vec<TokenId> { ... }
fn eos_token(&self) -> TokenId { ... }
fn tokenize_is_canonical(&self) -> bool { ... }
}

Required Methods

Provided Methods
fn tokenize_bytes_marker(&self, s: &[u8]) -> (Vec<TokenId>, usize)

Tokenize a given byte sequence. It will interpret text starting with SPECIAL_TOKEN_MARKER as special tokens. Returns the tokens, along with the number of tokens that should never be re-tokenized (because they were specified using the special token marker).
fn tokenize(&self, s: &str) -> Vec<TokenId>

Tokenize a string coming from the user. It may or may not interpret <|special_tokens|> as special.
fn tokenize_special(&self, s: &str) -> Vec<TokenId>

Tokenize a string. It will interpret <|special_tokens|> as special.
fn tokenize_is_canonical(&self) -> bool

If this returns true, the tokenizer always returns canonical tokenizations and can be used for forcing tokens. Non-canonical tokenizers will typically just use TokTrie::greedy_tokenize().