pub struct Tokenizer { /* private fields */ }
Wrapper around tokenizers::Tokenizer and minijinja::Environment
providing more utilities.
Implementations§
impl Tokenizer
impl Tokenizer
pub fn from_tokenizer(tokenizer: Tokenizer) -> Self
pub fn from_file(file: impl AsRef<Path>) -> Result<Self>
pub fn from_bytes(bytes: impl AsRef<[u8]>) -> Result<Self>
pub fn apply_chat_template<'a, I, R, T>( &'a mut self, model_template: String, args: ApplyChatTemplateArgs<'a, I, R, T>, ) -> Result<Vec<String>, Error>
pub fn apply_chat_template_and_encode<'a, I, R, T>( &mut self, model_template: String, args: ApplyChatTemplateArgs<'a, I, R, T>, ) -> Result<Vec<Encoding>, Error>
Methods from Deref<Target = TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>>§
pub fn with_normalizer(
&mut self,
normalizer: Option<impl Into<N>>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_normalizer( &mut self, normalizer: Option<impl Into<N>>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the normalizer
pub fn get_normalizer(&self) -> Option<&N>
pub fn get_normalizer(&self) -> Option<&N>
Get the normalizer
pub fn with_pre_tokenizer(
&mut self,
pre_tokenizer: Option<impl Into<PT>>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_pre_tokenizer( &mut self, pre_tokenizer: Option<impl Into<PT>>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the pre tokenizer
pub fn get_pre_tokenizer(&self) -> Option<&PT>
pub fn get_pre_tokenizer(&self) -> Option<&PT>
Get the pre tokenizer
pub fn with_post_processor(
&mut self,
post_processor: Option<impl Into<PP>>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_post_processor( &mut self, post_processor: Option<impl Into<PP>>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the post processor
pub fn get_post_processor(&self) -> Option<&PP>
pub fn get_post_processor(&self) -> Option<&PP>
Get the post processor
pub fn with_decoder(
&mut self,
decoder: Option<impl Into<D>>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_decoder( &mut self, decoder: Option<impl Into<D>>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the decoder
pub fn get_decoder(&self) -> Option<&D>
pub fn get_decoder(&self) -> Option<&D>
Get the decoder
pub fn with_model(
&mut self,
model: impl Into<M>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_model( &mut self, model: impl Into<M>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the model
pub fn with_added_vocabulary(
&mut self,
added_vocabulary: AddedVocabulary,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_added_vocabulary( &mut self, added_vocabulary: AddedVocabulary, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the added vocabulary.
pub fn get_added_vocabulary(&self) -> &AddedVocabulary
pub fn get_added_vocabulary(&self) -> &AddedVocabulary
Get the added vocabulary
pub fn with_truncation(
&mut self,
trunc: Option<TruncationParams>,
) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
pub fn with_truncation( &mut self, trunc: Option<TruncationParams>, ) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
Set the truncation parameters
Fails if stride is too high relative to max_length and post_processor.added_tokens()
pub fn get_truncation(&self) -> Option<&TruncationParams>
pub fn get_truncation(&self) -> Option<&TruncationParams>
Get the currently set truncation parameters
pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>
pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>
Get a mutable reference to the currently set truncation parameters
pub fn with_padding(
&mut self,
padding: Option<PaddingParams>,
) -> &mut TokenizerImpl<M, N, PT, PP, D>
pub fn with_padding( &mut self, padding: Option<PaddingParams>, ) -> &mut TokenizerImpl<M, N, PT, PP, D>
Set the padding parameters
pub fn get_padding(&self) -> Option<&PaddingParams>
pub fn get_padding(&self) -> Option<&PaddingParams>
Get the currently set padding parameters
pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>
pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>
Get a mutable reference to the currently set padding parameters
pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>
pub fn get_added_tokens_decoder(&self) -> AHashMap<u32, AddedToken>
pub fn get_added_tokens_decoder(&self) -> AHashMap<u32, AddedToken>
Get the added tokens decoder
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize
Get the size of the vocabulary
pub fn token_to_id(&self, token: &str) -> Option<u32>
pub fn token_to_id(&self, token: &str) -> Option<u32>
Converts a token into the corresponding id.
pub fn id_to_token(&self, id: u32) -> Option<String>
pub fn id_to_token(&self, id: u32) -> Option<String>
Converts an id to the corresponding token.
pub fn set_encode_special_tokens(&mut self, value: bool)
pub fn set_encode_special_tokens(&mut self, value: bool)
Set the added vocabulary’s splitting scheme
pub fn get_encode_special_tokens(&self) -> bool
pub fn get_encode_special_tokens(&self) -> bool
Get added token value
pub fn encode_fast<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
pub fn encode_fast<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
Encode the given input. This method accepts both single sequences, as well as pair
sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
Contrarily to encode, it does not compute offsets
// Sequences:
tokenizer.encode_fast("Single sequence", false);
tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode_fast(&["Single", "sequence"][..], false);
tokenizer.encode_fast((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
pub fn encode<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
pub fn encode<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
Encode the given input. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
pub fn encode_char_offsets<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
pub fn encode_char_offsets<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
where
E: Into<EncodeInput<'s>>,
Encode the given input, using offsets relative to chars instead of bytes. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
pub fn decode(
&self,
ids: &[u32],
skip_special_tokens: bool,
) -> Result<String, Box<dyn Error + Sync + Send>>
pub fn decode( &self, ids: &[u32], skip_special_tokens: bool, ) -> Result<String, Box<dyn Error + Sync + Send>>
Decode the given ids, back to a String
pub fn decode_stream(
&self,
skip_special_tokens: bool,
) -> DecodeStream<'_, M, N, PT, PP, D>
pub fn decode_stream( &self, skip_special_tokens: bool, ) -> DecodeStream<'_, M, N, PT, PP, D>
Decode the given ids, back to a String
See DecodeStream
pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize
pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize
Register the given tokens as special tokens. This is especially useful for removing these special tokens while decoding
pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize
pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize
Add the given tokens to the added vocabulary
pub fn post_process(
&self,
encoding: Encoding,
pair_encoding: Option<Encoding>,
add_special_tokens: bool,
) -> Result<Encoding, Box<dyn Error + Sync + Send>>
pub fn post_process( &self, encoding: Encoding, pair_encoding: Option<Encoding>, add_special_tokens: bool, ) -> Result<Encoding, Box<dyn Error + Sync + Send>>
Post processing logic, handling the case where there is no PostProcessor set
pub fn encode_batch<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
pub fn encode_batch<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
Encode all the sentences in parallel, using multiple threads
pub fn encode_batch_char_offsets<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
pub fn encode_batch_char_offsets<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
Encode all the sentences in parallel, using multiple threads.
The offsets on each Encoding will be relative to chars instead of bytes.
pub fn encode_batch_fast<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
pub fn encode_batch_fast<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>, Box<dyn Error + Sync + Send>>
Encode all the sentences in parallel, using multiple threads
pub fn decode_batch(
&self,
sentences: &[&[u32]],
skip_special_tokens: bool,
) -> Result<Vec<String>, Box<dyn Error + Sync + Send>>
pub fn decode_batch( &self, sentences: &[&[u32]], skip_special_tokens: bool, ) -> Result<Vec<String>, Box<dyn Error + Sync + Send>>
Decode all sentences in parallel
pub fn train_from_files<T>(
&mut self,
trainer: &mut T,
files: Vec<String>,
) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
pub fn train_from_files<T>( &mut self, trainer: &mut T, files: Vec<String>, ) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
Train our Model from files
pub fn train<T, I, S>(
&mut self,
trainer: &mut T,
sequences: I,
) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
pub fn train<T, I, S>( &mut self, trainer: &mut T, sequences: I, ) -> Result<&mut TokenizerImpl<M, N, PT, PP, D>, Box<dyn Error + Sync + Send>>
Train our Model, using the given Trainer and iterator
Trait Implementations§
Auto Trait Implementations§
impl !Freeze for Tokenizer
impl !RefUnwindSafe for Tokenizer
impl Send for Tokenizer
impl Sync for Tokenizer
impl Unpin for Tokenizer
impl !UnwindSafe for Tokenizer
Blanket Implementations§
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
impl<T> IntoEither for T
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more