pub struct Tokenizer(/* private fields */);
Implementations§
Source§impl Tokenizer
impl Tokenizer
Sourcepub fn new(model: impl Into<ModelWrapper>) -> Self
pub fn new(model: impl Into<ModelWrapper>) -> Self
Construct a new Tokenizer based on the model.
Sourcepub fn into_inner(
self,
) -> TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>
pub fn into_inner( self, ) -> TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>
Unwrap the TokenizerImpl.
pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>
pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>
Methods from Deref<Target = TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>>§
Sourcepub fn with_normalizer(&mut self, normalizer: Option<impl Into<N>>) -> &mut Self
pub fn with_normalizer(&mut self, normalizer: Option<impl Into<N>>) -> &mut Self
Set the normalizer
Sourcepub fn get_normalizer(&self) -> Option<&N>
pub fn get_normalizer(&self) -> Option<&N>
Get the normalizer
Sourcepub fn with_pre_tokenizer(
&mut self,
pre_tokenizer: Option<impl Into<PT>>,
) -> &mut Self
pub fn with_pre_tokenizer( &mut self, pre_tokenizer: Option<impl Into<PT>>, ) -> &mut Self
Set the pre tokenizer
Sourcepub fn get_pre_tokenizer(&self) -> Option<&PT>
pub fn get_pre_tokenizer(&self) -> Option<&PT>
Get the pre tokenizer
Sourcepub fn with_post_processor(
&mut self,
post_processor: Option<impl Into<PP>>,
) -> &mut Self
pub fn with_post_processor( &mut self, post_processor: Option<impl Into<PP>>, ) -> &mut Self
Set the post processor
Sourcepub fn get_post_processor(&self) -> Option<&PP>
pub fn get_post_processor(&self) -> Option<&PP>
Get the post processor
Sourcepub fn with_decoder(&mut self, decoder: Option<impl Into<D>>) -> &mut Self
pub fn with_decoder(&mut self, decoder: Option<impl Into<D>>) -> &mut Self
Set the decoder
Sourcepub fn get_decoder(&self) -> Option<&D>
pub fn get_decoder(&self) -> Option<&D>
Get the decoder
Sourcepub fn with_model(&mut self, model: impl Into<M>) -> &mut Self
pub fn with_model(&mut self, model: impl Into<M>) -> &mut Self
Set the model
Sourcepub fn with_added_vocabulary(
&mut self,
added_vocabulary: AddedVocabulary,
) -> &mut Self
pub fn with_added_vocabulary( &mut self, added_vocabulary: AddedVocabulary, ) -> &mut Self
Set the added vocabulary.
Sourcepub fn get_added_vocabulary(&self) -> &AddedVocabulary
pub fn get_added_vocabulary(&self) -> &AddedVocabulary
Get the added vocabulary
Sourcepub fn with_truncation(
&mut self,
trunc: Option<TruncationParams>,
) -> Result<&mut Self>
pub fn with_truncation( &mut self, trunc: Option<TruncationParams>, ) -> Result<&mut Self>
Set the truncation parameters
Fails if stride is too high relative to max_length and post_processor.added_tokens().
Sourcepub fn get_truncation(&self) -> Option<&TruncationParams>
pub fn get_truncation(&self) -> Option<&TruncationParams>
Get the currently set truncation parameters
Sourcepub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>
pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>
Get a mutable reference to the currently set truncation parameters
Sourcepub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self
pub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self
Set the padding parameters
Sourcepub fn get_padding(&self) -> Option<&PaddingParams>
pub fn get_padding(&self) -> Option<&PaddingParams>
Get the currently set padding parameters
Sourcepub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>
pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>
Get a mutable reference to the currently set padding parameters
pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>
Sourcepub fn get_added_tokens_decoder(&self) -> AHashMap<u32, AddedToken>
pub fn get_added_tokens_decoder(&self) -> AHashMap<u32, AddedToken>
Get the added tokens decoder
Sourcepub fn get_vocab_size(&self, with_added_tokens: bool) -> usize
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize
Get the size of the vocabulary
Sourcepub fn token_to_id(&self, token: &str) -> Option<u32>
pub fn token_to_id(&self, token: &str) -> Option<u32>
Converts a token into the corresponding id.
Sourcepub fn id_to_token(&self, id: u32) -> Option<String>
pub fn id_to_token(&self, id: u32) -> Option<String>
Converts an id to the corresponding token.
Sourcepub fn set_encode_special_tokens(&mut self, value: bool)
pub fn set_encode_special_tokens(&mut self, value: bool)
Set the added vocab’s splitting scheme.
Sourcepub fn get_encode_special_tokens(&self) -> bool
pub fn get_encode_special_tokens(&self) -> bool
Get the current value of the encode-special-tokens setting (as set by set_encode_special_tokens).
Sourcepub fn encode_fast<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
pub fn encode_fast<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
Encode the given input. This method accepts both single sequences, as well as pair
sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
Unlike encode, it does not compute offsets.
// Sequences:
tokenizer.encode_fast("Single sequence", false);
tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode_fast(&["Single", "sequence"][..], false);
tokenizer.encode_fast((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
Sourcepub fn encode<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
pub fn encode<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
Encode the given input. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
Sourcepub fn encode_char_offsets<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
pub fn encode_char_offsets<'s, E>(
&self,
input: E,
add_special_tokens: bool,
) -> Result<Encoding>where
E: Into<EncodeInput<'s>>,
Encode the given input, using offsets relative to chars instead of bytes. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
&["Sequence", "A"][..],
&["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
Sourcepub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String>
pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String>
Decode the given ids, back to a String
Sourcepub fn decode_stream(
&self,
skip_special_tokens: bool,
) -> DecodeStream<'_, M, N, PT, PP, D>
pub fn decode_stream( &self, skip_special_tokens: bool, ) -> DecodeStream<'_, M, N, PT, PP, D>
Decode the given ids, back to a String
See DecodeStream
Sourcepub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize
pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize
Register the given tokens as special tokens. This is especially useful for removing these special tokens while decoding
Sourcepub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize
pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize
Add the given tokens to the added vocabulary
Sourcepub fn post_process(
&self,
encoding: Encoding,
pair_encoding: Option<Encoding>,
add_special_tokens: bool,
) -> Result<Encoding>
pub fn post_process( &self, encoding: Encoding, pair_encoding: Option<Encoding>, add_special_tokens: bool, ) -> Result<Encoding>
Post processing logic, handling the case where there is no PostProcessor set
Sourcepub fn encode_batch<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
pub fn encode_batch<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>>
Encode all the sentences in parallel, using multiple threads
Sourcepub fn encode_batch_char_offsets<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
pub fn encode_batch_char_offsets<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>>
Encode all the sentences in parallel, using multiple threads.
The offsets on each Encoding
will be relative to chars instead of bytes.
Sourcepub fn encode_batch_fast<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
pub fn encode_batch_fast<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool, ) -> Result<Vec<Encoding>>
Encode all the sentences in parallel, using multiple threads
Sourcepub fn decode_batch(
&self,
sentences: &[&[u32]],
skip_special_tokens: bool,
) -> Result<Vec<String>>
pub fn decode_batch( &self, sentences: &[&[u32]], skip_special_tokens: bool, ) -> Result<Vec<String>>
Decode all sentences in parallel
Sourcepub fn train_from_files<T>(
&mut self,
trainer: &mut T,
files: Vec<String>,
) -> Result<&mut Self>
pub fn train_from_files<T>( &mut self, trainer: &mut T, files: Vec<String>, ) -> Result<&mut Self>
Train our Model from files
Sourcepub fn train<T, I, S>(
&mut self,
trainer: &mut T,
sequences: I,
) -> Result<&mut Self>
pub fn train<T, I, S>( &mut self, trainer: &mut T, sequences: I, ) -> Result<&mut Self>
Train our Model, using the given Trainer and iterator
Trait Implementations§
Source§impl Deref for Tokenizer
impl Deref for Tokenizer
Source§type Target = TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>
type Target = TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>
Source§impl<'de> Deserialize<'de> for Tokenizer
impl<'de> Deserialize<'de> for Tokenizer
Source§fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error>where
__D: Deserializer<'de>,
fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error>where
__D: Deserializer<'de>,
Source§impl<M, N, PT, PP, D> From<TokenizerImpl<M, N, PT, PP, D>> for Tokenizer where
M: Into<ModelWrapper>,
N: Into<NormalizerWrapper>,
PT: Into<PreTokenizerWrapper>,
PP: Into<PostProcessorWrapper>,
D: Into<DecoderWrapper>,
impl<M, N, PT, PP, D> From<TokenizerImpl<M, N, PT, PP, D>> for Tokenizer where
M: Into<ModelWrapper>,
N: Into<NormalizerWrapper>,
PT: Into<PreTokenizerWrapper>,
PP: Into<PostProcessorWrapper>,
D: Into<DecoderWrapper>,
Source§fn from(t: TokenizerImpl<M, N, PT, PP, D>) -> Self
fn from(t: TokenizerImpl<M, N, PT, PP, D>) -> Self
Auto Trait Implementations§
impl !Freeze for Tokenizer
impl RefUnwindSafe for Tokenizer
impl Send for Tokenizer
impl Sync for Tokenizer
impl Unpin for Tokenizer
impl UnwindSafe for Tokenizer
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for T where
T: Clone,
impl<T> CloneToUninit for T where
T: Clone,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more