Struct tokenizers::tokenizer::TokenizerImpl

source ·
pub struct TokenizerImpl<M, N, PT, PP, D> { /* private fields */ }
Expand description

A Tokenizer is capable of encoding/decoding any text.

Implementations§

source§

impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where M: Model, N: Normalizer, PT: PreTokenizer, PP: PostProcessor, D: Decoder,

source

pub fn new(model: M) -> Self

Instantiate a new Tokenizer, with the given Model

source

pub fn with_normalizer(&mut self, normalizer: impl Into<N>) -> &mut Self

Set the normalizer

source

pub fn get_normalizer(&self) -> Option<&N>

Get the normalizer

source

pub fn with_pre_tokenizer(&mut self, pre_tokenizer: impl Into<PT>) -> &mut Self

Set the pre tokenizer

source

pub fn get_pre_tokenizer(&self) -> Option<&PT>

Get the pre tokenizer

source

pub fn with_post_processor( &mut self, post_processor: impl Into<PP> ) -> &mut Self

Set the post processor

source

pub fn get_post_processor(&self) -> Option<&PP>

Get the post processor

source

pub fn with_decoder(&mut self, decoder: impl Into<D>) -> &mut Self

Set the decoder

source

pub fn get_decoder(&self) -> Option<&D>

Get the decoder

source

pub fn with_model(&mut self, model: impl Into<M>) -> &mut Self

Set the model

source

pub fn get_model(&self) -> &M

Get the model

source

pub fn with_added_vocabulary( &mut self, added_vocabulary: AddedVocabulary ) -> &mut Self

Set the added vocabulary.

source

pub fn get_added_vocabulary(&self) -> &AddedVocabulary

Get the added vocabulary

source

pub fn with_truncation( &mut self, trunc: Option<TruncationParams> ) -> Result<&mut Self>

Set the truncation parameters

Fails if stride is too high relative to max_length and post_processor.added_tokens()

source

pub fn get_truncation(&self) -> Option<&TruncationParams>

Get the currently set truncation parameters

source

pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>

Get a mutable reference to the currently set truncation parameters

source

pub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self

Set the padding parameters

source

pub fn get_padding(&self) -> Option<&PaddingParams>

Get the currently set padding parameters

source

pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>

Get a mutable reference to the currently set padding parameters

source

pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>

Get the vocabulary

source

pub fn get_added_tokens_decoder(&self) -> HashMap<u32, AddedToken>

Get the added tokens decoder

source

pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize

Get the size of the vocabulary

source

pub fn token_to_id(&self, token: &str) -> Option<u32>

Converts a token to the corresponding id.

source

pub fn id_to_token(&self, id: u32) -> Option<String>

Converts an id to the corresponding token.

source

pub fn set_encode_special_tokens(&mut self, value: bool)

Set the added vocabulary’s splitting scheme

source

pub fn get_encode_special_tokens(&self) -> bool

Get the current value of the encode_special_tokens setting

source

pub fn encode<'s, E>( &self, input: E, add_special_tokens: bool ) -> Result<Encoding>
where E: Into<EncodeInput<'s>>,

Encode the given input. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:

// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);

// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
    &["Sequence", "A"][..],
    &["Sequence", "B"][..]
), false);

// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
source

pub fn encode_char_offsets<'s, E>( &self, input: E, add_special_tokens: bool ) -> Result<Encoding>
where E: Into<EncodeInput<'s>>,

Encode the given input, using offsets relative to chars instead of bytes. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:

// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);

// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
    &["Sequence", "A"][..],
    &["Sequence", "B"][..]
), false);

// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
source

pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String>

Decode the given ids, back to a String

source§

impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where N: Normalizer, M: Model,

source

pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize

Register the given tokens as special tokens. This is especially useful for removing these special tokens while decoding.

source

pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize

Add the given tokens to the added vocabulary

source§

impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where PP: PostProcessor,

source

pub fn post_process( &self, encoding: Encoding, pair_encoding: Option<Encoding>, add_special_tokens: bool ) -> Result<Encoding>

Post processing logic, handling the case where there is no PostProcessor set

source§

impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where M: Model + Send + Sync, N: Normalizer + Send + Sync, PT: PreTokenizer + Send + Sync, PP: PostProcessor + Send + Sync, D: Decoder + Send + Sync,

source

pub fn encode_batch<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool ) -> Result<Vec<Encoding>>
where E: Into<EncodeInput<'s>> + Send,

Encode all the sentences in parallel, using multiple threads

source

pub fn encode_batch_char_offsets<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool ) -> Result<Vec<Encoding>>
where E: Into<EncodeInput<'s>> + Send,

Encode all the sentences in parallel, using multiple threads. The offsets on each Encoding will be relative to chars instead of bytes.

source

pub fn decode_batch( &self, sentences: &[&[u32]], skip_special_tokens: bool ) -> Result<Vec<String>>
where M: Send + Sync,

Decode all sentences in parallel

source

pub fn train_from_files<T>( &mut self, trainer: &mut T, files: Vec<String> ) -> Result<&mut Self>
where T: Trainer<Model = M> + Sync,

Train our Model from files

source

pub fn train<T, I, S>( &mut self, trainer: &mut T, sequences: I ) -> Result<&mut Self>
where T: Trainer<Model = M> + Sync, I: Iterator<Item = S> + Send, S: AsRef<str> + Send,

Train our Model, using the given Trainer and iterator

source§

impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>

source

pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>

Instantiate a new Tokenizer from the given file

source§

impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>

source

pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>

Instantiate a new Tokenizer from bytes

source§

impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where M: Serialize, N: Serialize, PT: Serialize, PP: Serialize, D: Serialize,

source

pub fn to_string(&self, pretty: bool) -> Result<String>

Serialize the current tokenizer as a String

source

pub fn save<P: AsRef<Path>>(&self, path: P, pretty: bool) -> Result<()>

Save the current tokenizer at the given path

Trait Implementations§

source§

impl<M: Clone, N: Clone, PT: Clone, PP: Clone, D: Clone> Clone for TokenizerImpl<M, N, PT, PP, D>

source§

fn clone(&self) -> TokenizerImpl<M, N, PT, PP, D>

Returns a copy of the value. Read more
1.0.0 · source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
source§

impl<M: Debug, N: Debug, PT: Debug, PP: Debug, D: Debug> Debug for TokenizerImpl<M, N, PT, PP, D>

source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more
source§

impl<'de, M, N, PT, PP, D> Deserialize<'de> for TokenizerImpl<M, N, PT, PP, D>
where M: Deserialize<'de> + Model, N: Deserialize<'de> + Normalizer, PT: Deserialize<'de> + PreTokenizer, PP: Deserialize<'de> + PostProcessor, D: Deserialize<'de> + Decoder,

source§

fn deserialize<De>(deserializer: De) -> Result<Self, De::Error>
where De: Deserializer<'de>,

Deserialize this value from the given Serde deserializer. Read more
source§

impl<M, N, PT, PP, D> From<TokenizerImpl<M, N, PT, PP, D>> for Tokenizer

source§

fn from(t: TokenizerImpl<M, N, PT, PP, D>) -> Self

Converts to this type from the input type.
source§

impl<M, N, PT, PP, D> FromStr for TokenizerImpl<M, N, PT, PP, D>
where M: for<'de> Deserialize<'de> + Model, N: for<'de> Deserialize<'de> + Normalizer, PT: for<'de> Deserialize<'de> + PreTokenizer, PP: for<'de> Deserialize<'de> + PostProcessor, D: for<'de> Deserialize<'de> + Decoder,

§

type Err = Box<dyn Error + Sync + Send>

The associated error which can be returned from parsing.
source§

fn from_str(s: &str) -> Result<Self>

Parses a string s to return a value of this type. Read more
source§

impl<M, N, PT, PP, D> Serialize for TokenizerImpl<M, N, PT, PP, D>
where M: Serialize, N: Serialize, PT: Serialize, PP: Serialize, D: Serialize,

source§

fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer,

Serialize this value into the given Serde serializer. Read more

Auto Trait Implementations§

§

impl<M, N, PT, PP, D> Freeze for TokenizerImpl<M, N, PT, PP, D>
where M: Freeze, N: Freeze, PT: Freeze, PP: Freeze, D: Freeze,

§

impl<M, N, PT, PP, D> RefUnwindSafe for TokenizerImpl<M, N, PT, PP, D>

§

impl<M, N, PT, PP, D> Send for TokenizerImpl<M, N, PT, PP, D>
where M: Send, N: Send, PT: Send, PP: Send, D: Send,

§

impl<M, N, PT, PP, D> Sync for TokenizerImpl<M, N, PT, PP, D>
where M: Sync, N: Sync, PT: Sync, PP: Sync, D: Sync,

§

impl<M, N, PT, PP, D> Unpin for TokenizerImpl<M, N, PT, PP, D>
where M: Unpin, N: Unpin, PT: Unpin, PP: Unpin, D: Unpin,

§

impl<M, N, PT, PP, D> UnwindSafe for TokenizerImpl<M, N, PT, PP, D>

Blanket Implementations§

source§

impl<T> Any for T
where T: 'static + ?Sized,

source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
source§

impl<T> Borrow<T> for T
where T: ?Sized,

source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
source§

impl<T> From<T> for T

source§

fn from(t: T) -> T

Returns the argument unchanged.

source§

impl<T, U> Into<U> for T
where U: From<T>,

source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

source§

impl<T> IntoEither for T

source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
source§

impl<T> Pointable for T

source§

const ALIGN: usize = _

The alignment of pointer.
§

type Init = T

The type for initializers.
source§

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a with the given initializer. Read more
source§

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more
source§

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more
source§

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more
source§

impl<T> ToOwned for T
where T: Clone,

§

type Owned = T

The resulting type after obtaining ownership.
source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

§

type Error = Infallible

The type returned in the event of a conversion error.
source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

source§

fn vzip(self) -> V

source§

impl<T> DeserializeOwned for T
where T: for<'de> Deserialize<'de>,