pub struct Tokenizer { /* private fields */ }
Wrapper around tokenizers::Tokenizer and minijinja::Environment
providing more utilities.
Implementations§
impl Tokenizer
impl Tokenizer
pub fn from_tokenizer(tokenizer: Tokenizer) -> Self
pub fn from_file(file: impl AsRef<Path>) -> Result<Self>
pub fn from_bytes(bytes: impl AsRef<[u8]>) -> Result<Self>
pub fn apply_chat_template<'a, I, R, T>( &'a mut self, model_template: String, args: ApplyChatTemplateArgs<'a, I, R, T>, ) -> Result<Vec<String>, Error>
pub fn apply_chat_template_and_encode<'a, I, R, T>( &mut self, model_template: String, args: ApplyChatTemplateArgs<'a, I, R, T>, ) -> Result<Vec<Encoding>, Error>
Methods from Deref<Target = TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>>§
pub fn with_normalizer(
&mut self,
normalizer: Option<impl Into<N>>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_normalizer( &mut self, normalizer: Option<impl Into<N>>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the normalizer
pub fn get_normalizer(&self) -> Option<&N>
pub fn get_normalizer(&self) -> Option<&N>
Get the normalizer
pub fn with_pre_tokenizer(
&mut self,
pre_tokenizer: Option<impl Into<PT>>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_pre_tokenizer( &mut self, pre_tokenizer: Option<impl Into<PT>>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the pre tokenizer
pub fn get_pre_tokenizer(&self) -> Option<&PT>
pub fn get_pre_tokenizer(&self) -> Option<&PT>
Get the pre tokenizer
pub fn with_post_processor(
&mut self,
post_processor: Option<impl Into<PP>>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_post_processor( &mut self, post_processor: Option<impl Into<PP>>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the post processor
pub fn get_post_processor(&self) -> Option<&PP>
pub fn get_post_processor(&self) -> Option<&PP>
Get the post processor
pub fn with_decoder(
&mut self,
decoder: Option<impl Into<D>>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_decoder( &mut self, decoder: Option<impl Into<D>>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the decoder
pub fn get_decoder(&self) -> Option<&D>
pub fn get_decoder(&self) -> Option<&D>
Get the decoder
pub fn with_model(
&mut self,
model: impl Into<M>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_model( &mut self, model: impl Into<M>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the model
pub fn with_added_vocabulary(
&mut self,
added_vocabulary: AddedVocabulary,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_added_vocabulary( &mut self, added_vocabulary: AddedVocabulary, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the added vocabulary.
pub fn get_added_vocabulary(&self) -> &AddedVocabulary
pub fn get_added_vocabulary(&self) -> &AddedVocabulary
Get the added vocabulary
pub fn with_truncation(
&mut self,
trunc: Option<TruncationParams>,
) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
pub fn with_truncation( &mut self, trunc: Option<TruncationParams>, ) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
Set the truncation parameters
Fails if stride is too high relative to max_length and post_processor.added_tokens()
pub fn get_truncation(&self) -> Option<&TruncationParams>
pub fn get_truncation(&self) -> Option<&TruncationParams>
Get the currently set truncation parameters
pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>
pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>
Get a mutable reference to the currently set truncation parameters
pub fn with_padding(
&mut self,
padding: Option<PaddingParams>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_padding( &mut self, padding: Option<PaddingParams>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the padding parameters
pub fn get_padding(&self) -> Option<&PaddingParams>
pub fn get_padding(&self) -> Option<&PaddingParams>
Get the currently set padding parameters
pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>
pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>
Get a mutable reference to the currently set padding parameters
pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>
pub fn get_added_tokens_decoder(&self) -> AHashMap<u32, AddedToken>
pub fn get_added_tokens_decoder(&self) -> AHashMap<u32, AddedToken>
Get the added tokens decoder
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize
Get the size of the vocabulary
pub fn token_to_id(&self, token: &str) -> Option<u32>
pub fn token_to_id(&self, token: &str) -> Option<u32>
Converts a token into the corresponding id.
pub fn id_to_token(&self, id: u32) -> Option<String>
pub fn id_to_token(&self, id: u32) -> Option<String>
Converts an id to the corresponding token.
pub fn set_encode_special_tokens(&mut self, value: bool)
pub fn set_encode_special_tokens(&mut self, value: bool)
Set the added vocabulary’s splitting scheme
pub fn get_encode_special_tokens(&self) -> bool
pub fn get_encode_special_tokens(&self) -> bool
Get added token value
pub fn encode_fast<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
pub fn encode_fast<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
Encode the given input. This method accepts both single sequences, as well as pair
sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
Contrarily to encode, it does not compute offsets
// Sequences:
tokenizer.encode_fast("Single sequence", false);
tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode_fast(&["Single", "sequence"][..], false);
tokenizer.encode_fast((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
pub fn encode<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
pub fn encode<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
Encode the given input. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
pub fn encode_char_offsets<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
pub fn encode_char_offsets<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
Encode the given input, using offsets relative to chars instead of bytes. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
pub fn decode(
&self,
ids: &[u32],
skip_special_tokens: bool,
) -> Result<String, Box<dyn Error + Sync + Send>>
pub fn decode( &self, ids: &[u32], skip_special_tokens: bool, ) -> Result<String, Box<dyn Error + Sync + Send>>
Decode the given ids, back to a String
pub fn decode_stream(
&self,
skip_special_tokens: bool,
) -> DecodeStream<'_, M, N, PT, PP, D>
pub fn decode_stream( &self, skip_special_tokens: bool, ) -> DecodeStream<'_, M, N, PT, PP, D>
Decode the given ids, back to a String
See DecodeStream
pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize
pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize
Register the given tokens as special tokens. This is especially useful for removing these special tokens while decoding
pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize
pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize
Add the given tokens to the added vocabulary
pub fn post_process(
&self,
encoding: Encoding,
pair_encoding: Option<Encoding>,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
pub fn post_process( &self, encoding: Encoding, pair_encoding: Option<Encoding>, add_special_tokens: bool, ) -> Result<Encoding, Box<dyn Error + Sync + Send>>
Post processing logic, handling the case where there is no PostProcessor set
pub fn encode_batch<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
pub fn encode_batch<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
Encode all the sentences in parallel, using multiple threads
pub fn encode_batch_char_offsets<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
pub fn encode_batch_char_offsets<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
Encode all the sentences in parallel, using multiple threads.
The offsets on each Encoding will be relative to chars instead of bytes.
pub fn encode_batch_fast<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
pub fn encode_batch_fast<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
Encode all the sentences in parallel, using multiple threads
pub fn decode_batch(
&self,
sentences: &[&[u32]],
skip_special_tokens: bool,
) -> Result<Vec<String>, Box<dyn Error + Sync + Send>>
pub fn decode_batch( &self, sentences: &[&[u32]], skip_special_tokens: bool, ) -> Result<Vec<String>, Box<dyn Error + Sync + Send>>
Decode all sentences in parallel
pub fn train_from_files<T>(
&mut self,
trainer: &mut T,
files: Vec<String>,
) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
pub fn train_from_files<T>( &mut self, trainer: &mut T, files: Vec<String>, ) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
Train our Model from files
pub fn train<T, I, S>(
&mut self,
trainer: &mut T,
sequences: I,
) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
pub fn train<T, I, S>( &mut self, trainer: &mut T, sequences: I, ) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
Train our Model, using the given Trainer and iterator
Trait Implementations§
Auto Trait Implementations§
impl !Freeze for Tokenizer
impl !RefUnwindSafe for Tokenizer
impl Send for Tokenizer
impl Sync for Tokenizer
impl Unpin for Tokenizer
impl !UnwindSafe for Tokenizer
Blanket Implementations§
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
impl<T> IntoEither for T
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more