pub struct TokenizerImpl<M, N, PT, PP, D> { /* private fields */ }
Expand description
A Tokenizer is capable of encoding/decoding any text.
Implementations§
Source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
Sourcepub fn with_normalizer(&mut self, normalizer: Option<impl Into<N>>) -> &mut Self
pub fn with_normalizer(&mut self, normalizer: Option<impl Into<N>>) -> &mut Self
Set the normalizer
Sourcepub fn get_normalizer(&self) -> Option<&N>
pub fn get_normalizer(&self) -> Option<&N>
Get the normalizer
Sourcepub fn with_pre_tokenizer(
&mut self,
pre_tokenizer: Option<impl Into<PT>>,
) -> &mut Self
pub fn with_pre_tokenizer( &mut self, pre_tokenizer: Option<impl Into<PT>>, ) -> &mut Self
Set the pre tokenizer
Sourcepub fn get_pre_tokenizer(&self) -> Option<&PT>
pub fn get_pre_tokenizer(&self) -> Option<&PT>
Get the pre tokenizer
Sourcepub fn with_post_processor(
&mut self,
post_processor: Option<impl Into<PP>>,
) -> &mut Self
pub fn with_post_processor( &mut self, post_processor: Option<impl Into<PP>>, ) -> &mut Self
Set the post processor
Sourcepub fn get_post_processor(&self) -> Option<&PP>
pub fn get_post_processor(&self) -> Option<&PP>
Get the post processor
Sourcepub fn with_decoder(&mut self, decoder: Option<impl Into<D>>) -> &mut Self
pub fn with_decoder(&mut self, decoder: Option<impl Into<D>>) -> &mut Self
Set the decoder
Sourcepub fn get_decoder(&self) -> Option<&D>
pub fn get_decoder(&self) -> Option<&D>
Get the decoder
Sourcepub fn with_model(&mut self, model: impl Into<M>) -> &mut Self
pub fn with_model(&mut self, model: impl Into<M>) -> &mut Self
Set the model
Sourcepub fn with_added_vocabulary(
&mut self,
added_vocabulary: AddedVocabulary,
) -> &mut Self
pub fn with_added_vocabulary( &mut self, added_vocabulary: AddedVocabulary, ) -> &mut Self
Set the added vocabulary.
Sourcepub fn get_added_vocabulary(&self) -> &AddedVocabulary
pub fn get_added_vocabulary(&self) -> &AddedVocabulary
Get the added vocabulary
Sourcepub fn with_truncation(
&mut self,
trunc: Option<TruncationParams>,
) -> Result<&mut Self>
pub fn with_truncation( &mut self, trunc: Option<TruncationParams>, ) -> Result<&mut Self>
Set the truncation parameters
Fails if stride is too high relative to max_length and post_processor.added_tokens().
Sourcepub fn get_truncation(&self) -> Option<&TruncationParams>
pub fn get_truncation(&self) -> Option<&TruncationParams>
Get the currently set truncation parameters
Sourcepub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>
pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>
Get a mutable reference to the currently set truncation parameters
Sourcepub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self
pub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self
Set the padding parameters
Sourcepub fn get_padding(&self) -> Option<&PaddingParams>
pub fn get_padding(&self) -> Option<&PaddingParams>
Get the currently set padding parameters
Sourcepub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>
pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>
Get a mutable reference to the currently set padding parameters
pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>
Sourcepub fn get_added_tokens_decoder(&self) -> AHashMap<u32, AddedToken>
pub fn get_added_tokens_decoder(&self) -> AHashMap<u32, AddedToken>
Get the added tokens decoder
Sourcepub fn get_vocab_size(&self, with_added_tokens: bool) -> usize
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize
Get the size of the vocabulary
Sourcepub fn token_to_id(&self, token: &str) -> Option<u32>
pub fn token_to_id(&self, token: &str) -> Option<u32>
Converts a token into the corresponding id.
Sourcepub fn id_to_token(&self, id: u32) -> Option<String>
pub fn id_to_token(&self, id: u32) -> Option<String>
Converts an id to the corresponding token.
Sourcepub fn set_encode_special_tokens(&mut self, value: bool)
pub fn set_encode_special_tokens(&mut self, value: bool)
Set the added vocabulary’s splitting scheme.
Sourcepub fn get_encode_special_tokens(&self) -> bool
pub fn get_encode_special_tokens(&self) -> bool
Get the added vocabulary’s current special-token encoding value.
Sourcepub fn encode_fast<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
pub fn encode_fast<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
Encode the given input. This method accepts both single sequences, as well as pair
sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
Contrary to encode, it does not compute offsets.
// Sequences:
tokenizer.encode_fast("Single sequence", false);
tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode_fast(&["Single", "sequence"][..], false);
tokenizer.encode_fast((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
Sourcepub fn encode<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
pub fn encode<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
Encode the given input. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
Sourcepub fn encode_char_offsets<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
pub fn encode_char_offsets<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
Encode the given input, using offsets relative to chars instead of bytes. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
Sourcepub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String>
pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String>
Decode the given ids back to a String.
Sourcepub fn decode_stream(
&self,
skip_special_tokens: bool,
) -> DecodeStream<'_, M, N, PT, PP, D>
pub fn decode_stream( &self, skip_special_tokens: bool, ) -> DecodeStream<'_, M, N, PT, PP, D>
Decode the given ids back to a String.
See DecodeStream
Source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
N: Normalizer,
M: Model,
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
N: Normalizer,
M: Model,
Sourcepub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize
pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize
Register the given tokens as special tokens. This is especially useful for removing these special tokens while decoding
Sourcepub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize
pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize
Add the given tokens to the added vocabulary
Source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
PP: PostProcessor,
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
PP: PostProcessor,
Source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
M: Model + Send + Sync,
N: Normalizer + Send + Sync,
PT: PreTokenizer + Send + Sync,
PP: PostProcessor + Send + Sync,
D: Decoder + Send + Sync,
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
M: Model + Send + Sync,
N: Normalizer + Send + Sync,
PT: PreTokenizer + Send + Sync,
PP: PostProcessor + Send + Sync,
D: Decoder + Send + Sync,
Sourcepub fn encode_batch<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
pub fn encode_batch<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>>
Encode all the sentences in parallel, using multiple threads
Sourcepub fn encode_batch_char_offsets<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
pub fn encode_batch_char_offsets<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>>
Encode all the sentences in parallel, using multiple threads.
The offsets on each Encoding
will be relative to chars instead of bytes.
Sourcepub fn encode_batch_fast<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
pub fn encode_batch_fast<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>>
Encode all the sentences in parallel, using multiple threads
Sourcepub fn decode_batch(
&self,
sentences: &[&[u32]],
skip_special_tokens: bool,
) -> Result<Vec<String>>
pub fn decode_batch( &self, sentences: &[&[u32]], skip_special_tokens: bool, ) -> Result<Vec<String>>
Decode all sentences in parallel
Source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
M: DeserializeOwned + Model,
N: DeserializeOwned + Normalizer,
PT: DeserializeOwned + PreTokenizer,
PP: DeserializeOwned + PostProcessor,
D: DeserializeOwned + Decoder,
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
M: DeserializeOwned + Model,
N: DeserializeOwned + Normalizer,
PT: DeserializeOwned + PreTokenizer,
PP: DeserializeOwned + PostProcessor,
D: DeserializeOwned + Decoder,
Source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
M: DeserializeOwned + Model,
N: DeserializeOwned + Normalizer,
PT: DeserializeOwned + PreTokenizer,
PP: DeserializeOwned + PostProcessor,
D: DeserializeOwned + Decoder,
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
M: DeserializeOwned + Model,
N: DeserializeOwned + Normalizer,
PT: DeserializeOwned + PreTokenizer,
PP: DeserializeOwned + PostProcessor,
D: DeserializeOwned + Decoder,
Sourcepub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>
pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>
Instantiate a new Tokenizer from bytes
Source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
Trait Implementations§
Source§impl<M: Clone, N: Clone, PT: Clone, PP: Clone, D: Clone> Clone for TokenizerImpl<M, N, PT, PP, D>
impl<M: Clone, N: Clone, PT: Clone, PP: Clone, D: Clone> Clone for TokenizerImpl<M, N, PT, PP, D>
Source§fn clone(&self) -> TokenizerImpl<M, N, PT, PP, D>
fn clone(&self) -> TokenizerImpl<M, N, PT, PP, D>
1.0.0 · Source§const fn clone_from(&mut self, source: &Self)
const fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl<M: Debug, N: Debug, PT: Debug, PP: Debug, D: Debug> Debug for TokenizerImpl<M, N, PT, PP, D>
impl<M: Debug, N: Debug, PT: Debug, PP: Debug, D: Debug> Debug for TokenizerImpl<M, N, PT, PP, D>
Source§impl<'de, M, N, PT, PP, D> Deserialize<'de> for TokenizerImpl<M, N, PT, PP, D>where
M: Deserialize<'de> + Model,
N: Deserialize<'de> + Normalizer,
PT: Deserialize<'de> + PreTokenizer,
PP: Deserialize<'de> + PostProcessor,
D: Deserialize<'de> + Decoder,
impl<'de, M, N, PT, PP, D> Deserialize<'de> for TokenizerImpl<M, N, PT, PP, D>where
M: Deserialize<'de> + Model,
N: Deserialize<'de> + Normalizer,
PT: Deserialize<'de> + PreTokenizer,
PP: Deserialize<'de> + PostProcessor,
D: Deserialize<'de> + Decoder,
Source§fn deserialize<De>(deserializer: De) -> Result<Self, De::Error>where
De: Deserializer<'de>,
fn deserialize<De>(deserializer: De) -> Result<Self, De::Error>where
De: Deserializer<'de>,
Source§impl<M, N, PT, PP, D> From<TokenizerImpl<M, N, PT, PP, D>> for Tokenizerwhere
M: Into<ModelWrapper>,
N: Into<NormalizerWrapper>,
PT: Into<PreTokenizerWrapper>,
PP: Into<PostProcessorWrapper>,
D: Into<DecoderWrapper>,
impl<M, N, PT, PP, D> From<TokenizerImpl<M, N, PT, PP, D>> for Tokenizerwhere
M: Into<ModelWrapper>,
N: Into<NormalizerWrapper>,
PT: Into<PreTokenizerWrapper>,
PP: Into<PostProcessorWrapper>,
D: Into<DecoderWrapper>,
Source§fn from(t: TokenizerImpl<M, N, PT, PP, D>) -> Self
fn from(t: TokenizerImpl<M, N, PT, PP, D>) -> Self
Source§impl<M, N, PT, PP, D> FromStr for TokenizerImpl<M, N, PT, PP, D>where
M: for<'de> Deserialize<'de> + Model,
N: for<'de> Deserialize<'de> + Normalizer,
PT: for<'de> Deserialize<'de> + PreTokenizer,
PP: for<'de> Deserialize<'de> + PostProcessor,
D: for<'de> Deserialize<'de> + Decoder,
impl<M, N, PT, PP, D> FromStr for TokenizerImpl<M, N, PT, PP, D>where
M: for<'de> Deserialize<'de> + Model,
N: for<'de> Deserialize<'de> + Normalizer,
PT: for<'de> Deserialize<'de> + PreTokenizer,
PP: for<'de> Deserialize<'de> + PostProcessor,
D: for<'de> Deserialize<'de> + Decoder,
Source§impl<M, N, PT, PP, D> Serialize for TokenizerImpl<M, N, PT, PP, D>
impl<M, N, PT, PP, D> Serialize for TokenizerImpl<M, N, PT, PP, D>
Auto Trait Implementations§
impl<M, N, PT, PP, D> Freeze for TokenizerImpl<M, N, PT, PP, D>
impl<M, N, PT, PP, D> RefUnwindSafe for TokenizerImpl<M, N, PT, PP, D>
impl<M, N, PT, PP, D> Send for TokenizerImpl<M, N, PT, PP, D>
impl<M, N, PT, PP, D> Sync for TokenizerImpl<M, N, PT, PP, D>
impl<M, N, PT, PP, D> Unpin for TokenizerImpl<M, N, PT, PP, D>
impl<M, N, PT, PP, D> UnwindSafe for TokenizerImpl<M, N, PT, PP, D>
Blanket Implementations§
Source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for Twhere
T: Clone,
impl<T> CloneToUninit for Twhere
T: Clone,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more