Struct tokenizers::tokenizer::Tokenizer

pub struct Tokenizer(_);

Implementations§

impl Tokenizer

pub fn new(model: impl Into<ModelWrapper>) -> Self

Construct a new Tokenizer based on the model.

pub fn into_inner( self ) -> TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>

Unwrap the TokenizerImpl.

pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>

pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>

pub fn from_pretrained<S: AsRef<str>>( identifier: S, params: Option<FromPretrainedParameters> ) -> Result<Self>

Methods from Deref<Target = TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>>§

pub fn with_normalizer(&mut self, normalizer: impl Into<N>) -> &mut Self

Set the normalizer

pub fn get_normalizer(&self) -> Option<&N>

Get the normalizer

pub fn with_pre_tokenizer(&mut self, pre_tokenizer: impl Into<PT>) -> &mut Self

Set the pre tokenizer

pub fn get_pre_tokenizer(&self) -> Option<&PT>

Get the pre tokenizer

pub fn with_post_processor( &mut self, post_processor: impl Into<PP> ) -> &mut Self

Set the post processor

pub fn get_post_processor(&self) -> Option<&PP>

Get the post processor

pub fn with_decoder(&mut self, decoder: impl Into<D>) -> &mut Self

Set the decoder

pub fn get_decoder(&self) -> Option<&D>

Get the decoder

pub fn with_model(&mut self, model: impl Into<M>) -> &mut Self

Set the model

pub fn get_model(&self) -> &M

Get the model

pub fn with_truncation(&mut self, trunc: Option<TruncationParams>) -> &mut Self

Set the truncation parameters

pub fn get_truncation(&self) -> Option<&TruncationParams>

Get the currently set truncation parameters

pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>

Get a mutable reference to the currently set truncation parameters

pub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self

Set the padding parameters

pub fn get_padding(&self) -> Option<&PaddingParams>

Get the currently set padding parameters

pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>

Get a mutable reference to the currently set padding parameters

pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>

Get the vocabulary

pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize

Get the size of the vocabulary

pub fn token_to_id(&self, token: &str) -> Option<u32>

Converts a token to the corresponding id.

pub fn id_to_token(&self, id: u32) -> Option<String>

Converts an id to the corresponding token.

pub fn encode<'s, E>( &self, input: E, add_special_tokens: bool ) -> Result<Encoding> where E: Into<EncodeInput<'s>>,

Encode the given input. This method accepts both single sequences and pair sequences. A sequence can be either a raw string or already pre-tokenized input:

// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);

// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
    &["Sequence", "A"][..],
    &["Sequence", "B"][..]
), false);

// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);

pub fn encode_char_offsets<'s, E>( &self, input: E, add_special_tokens: bool ) -> Result<Encoding> where E: Into<EncodeInput<'s>>,

Encode the given input, using offsets relative to chars instead of bytes. This method accepts both single sequences and pair sequences. A sequence can be either a raw string or already pre-tokenized input:

// Sequences:
tokenizer.encode_char_offsets("Single sequence", false);
tokenizer.encode_char_offsets(("Sequence A", "Sequence B"), false);

// Pre-tokenized sequences:
tokenizer.encode_char_offsets(&["Single", "sequence"][..], false);
tokenizer.encode_char_offsets((
    &["Sequence", "A"][..],
    &["Sequence", "B"][..]
), false);

// or even both types together:
tokenizer.encode_char_offsets(("A complete sequence", &["And", "a", "tokenized"][..]), false);

pub fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> Result<String>

Decode the given ids, back to a String

pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize

Register the given tokens as special tokens. This is especially useful for removing these special tokens while decoding

pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize

Add the given tokens to the added vocabulary

pub fn post_process( &self, encoding: Encoding, pair_encoding: Option<Encoding>, add_special_tokens: bool ) -> Result<Encoding>

Post processing logic, handling the case where there is no PostProcessor set

pub fn encode_batch<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool ) -> Result<Vec<Encoding>> where E: Into<EncodeInput<'s>> + Send,

Encode all the sentences in parallel, using multiple threads

pub fn encode_batch_char_offsets<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool ) -> Result<Vec<Encoding>> where E: Into<EncodeInput<'s>> + Send,

Encode all the sentences in parallel, using multiple threads. The offsets on each Encoding will be relative to chars instead of bytes.

pub fn decode_batch( &self, sentences: Vec<Vec<u32>>, skip_special_tokens: bool ) -> Result<Vec<String>> where M: Send + Sync,

Decode all sentences in parallel

pub fn train_from_files<T>( &mut self, trainer: &mut T, files: Vec<String> ) -> Result<&mut Self> where T: Trainer<Model = M> + Sync,

Train our Model from files

pub fn train<T, I, S>( &mut self, trainer: &mut T, sequences: I ) -> Result<&mut Self> where T: Trainer<Model = M> + Sync, I: Iterator<Item = S> + Send, S: AsRef<str> + Send,

Train our Model, using the given Trainer and iterator

pub fn to_string(&self, pretty: bool) -> Result<String>

Serialize the current tokenizer as a String

pub fn save<P: AsRef<Path>>(&self, path: P, pretty: bool) -> Result<()>

Save the current tokenizer at the given path

Trait Implementations§

impl Clone for Tokenizer

fn clone(&self) -> Tokenizer

Returns a copy of the value. Read more

1.0.0 · source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

impl Debug for Tokenizer

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

impl Deref for Tokenizer

type Target = TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>

The resulting type after dereferencing.

fn deref(&self) -> &Self::Target

Dereferences the value.

impl DerefMut for Tokenizer

fn deref_mut(&mut self) -> &mut Self::Target

Mutably dereferences the value.

impl<'de> Deserialize<'de> for Tokenizer

fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: Deserializer<'de>,

Deserialize this value from the given Serde deserializer. Read more

impl<M, N, PT, PP, D> From<TokenizerImpl<M, N, PT, PP, D>> for Tokenizer where M: Into<ModelWrapper>, N: Into<NormalizerWrapper>, PT: Into<PreTokenizerWrapper>, PP: Into<PostProcessorWrapper>, D: Into<DecoderWrapper>,

fn from(t: TokenizerImpl<M, N, PT, PP, D>) -> Self

Converts to this type from the input type.

impl FromStr for Tokenizer

type Err = Box<dyn Error + Send + Sync + 'static, Global>

The associated error which can be returned from parsing.

fn from_str(s: &str) -> Result<Self>

Parses a string s to return a value of this type. Read more

impl Serialize for Tokenizer

fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: Serializer,

Serialize this value into the given Serde serializer. Read more

Auto Trait Implementations§

impl RefUnwindSafe for Tokenizer

impl Send for Tokenizer

impl Sync for Tokenizer

impl Unpin for Tokenizer

impl UnwindSafe for Tokenizer

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

impl<T> Borrow<T> for T where T: ?Sized,

const: unstable · source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

impl<T> BorrowMut<T> for T where T: ?Sized,

const: unstable · source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

impl<T> From<T> for T

const: unstable · source§

fn from(t: T) -> T

Returns the argument unchanged.

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more

impl<T, U> Into<U> for T where U: From<T>,

const: unstable · source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

impl<T> Pointable for T

const ALIGN: usize = mem::align_of::<T>()

The alignment of the pointer.

type Init = T

The type for initializers.

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes an object with the given initializer. Read more

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more

impl<T> Same<T> for T

type Output = T

Should always be Self

impl<T> ToOwned for T where T: Clone,

type Owned = T

The resulting type after obtaining ownership.

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

The type returned in the event of a conversion error.

const: unstable · source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

const: unstable · source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more

impl<T> DeserializeOwned for T where T: for<'de> Deserialize<'de>,