pub enum TokenizerOption {
Show 19 variants
Bert(BertTokenizer),
Deberta(DeBERTaTokenizer),
DebertaV2(DeBERTaV2Tokenizer),
Roberta(RobertaTokenizer),
XLMRoberta(XLMRobertaTokenizer),
Marian(MarianTokenizer),
T5(T5Tokenizer),
Albert(AlbertTokenizer),
XLNet(XLNetTokenizer),
GPT2(Gpt2Tokenizer),
OpenAiGpt(OpenAiGptTokenizer),
Reformer(ReformerTokenizer),
ProphetNet(ProphetNetTokenizer),
Pegasus(PegasusTokenizer),
MBart50(MBart50Tokenizer),
M2M100(M2M100Tokenizer),
NLLB(NLLBTokenizer),
FNet(FNetTokenizer),
Bart(RobertaTokenizer),
}
Expand description
Abstraction that holds a particular tokenizer, which can be any of the supported models
Variants§
Bert(BertTokenizer)
Bert Tokenizer
Deberta(DeBERTaTokenizer)
DeBERTa Tokenizer
DebertaV2(DeBERTaV2Tokenizer)
DeBERTa V2 Tokenizer
Roberta(RobertaTokenizer)
Roberta Tokenizer
XLMRoberta(XLMRobertaTokenizer)
XLMRoberta Tokenizer
Marian(MarianTokenizer)
Marian Tokenizer
T5(T5Tokenizer)
T5 Tokenizer
Albert(AlbertTokenizer)
Albert Tokenizer
XLNet(XLNetTokenizer)
XLNet Tokenizer
GPT2(Gpt2Tokenizer)
GPT2 Tokenizer
OpenAiGpt(OpenAiGptTokenizer)
GPT Tokenizer
Reformer(ReformerTokenizer)
Reformer Tokenizer
ProphetNet(ProphetNetTokenizer)
ProphetNet Tokenizer
Pegasus(PegasusTokenizer)
Pegasus Tokenizer
MBart50(MBart50Tokenizer)
MBart50 Tokenizer
M2M100(M2M100Tokenizer)
M2M100 Tokenizer
NLLB(NLLBTokenizer)
NLLB Tokenizer
FNet(FNetTokenizer)
FNet Tokenizer
Bart(RobertaTokenizer)
Bart Tokenizer
Implementations§
Source§impl TokenizerOption
impl TokenizerOption
Sourcepub fn from_file(
model_type: ModelType,
vocab_path: &str,
merges_path: Option<&str>,
lower_case: bool,
strip_accents: impl Into<Option<bool>>,
add_prefix_space: impl Into<Option<bool>>,
) -> Result<Self, RustBertError>
pub fn from_file( model_type: ModelType, vocab_path: &str, merges_path: Option<&str>, lower_case: bool, strip_accents: impl Into<Option<bool>>, add_prefix_space: impl Into<Option<bool>>, ) -> Result<Self, RustBertError>
Interface method to load a tokenizer from file
Sourcepub fn encode_list<S>(
&self,
text_list: &[S],
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize,
) -> Vec<TokenizedInput>
pub fn encode_list<S>( &self, text_list: &[S], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize, ) -> Vec<TokenizedInput>
Interface method for encoding a list of inputs
Sourcepub fn encode_pair_list(
&self,
text_pair_list: &[(&str, &str)],
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize,
) -> Vec<TokenizedInput>
pub fn encode_pair_list( &self, text_pair_list: &[(&str, &str)], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize, ) -> Vec<TokenizedInput>
Interface method for pair encoding
Sourcepub fn encode_pair(
&self,
text_1: &str,
text_2: Option<&str>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize,
) -> TokenizedInput
pub fn encode_pair( &self, text_1: &str, text_2: Option<&str>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize, ) -> TokenizedInput
Interface method for pair encoding (single input)
Sourcepub fn tokenize_with_offsets(&self, text: &str) -> TokensWithOffsets
pub fn tokenize_with_offsets(&self, text: &str) -> TokensWithOffsets
Interface method for tokenization with offsets
Sourcepub fn tokenize_list<S>(&self, text: &[S]) -> Vec<Vec<String>>
pub fn tokenize_list<S>(&self, text: &[S]) -> Vec<Vec<String>>
Interface method for tokenizing a list of inputs
Sourcepub fn decode(
&self,
token_ids: &[i64],
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool,
) -> String
pub fn decode( &self, token_ids: &[i64], skip_special_tokens: bool, clean_up_tokenization_spaces: bool, ) -> String
Interface method for decoding
Sourcepub fn build_input_with_special_tokens(
&self,
token_ids_with_offsets_1: TokenIdsWithOffsets,
token_ids_with_offsets_2: Option<TokenIdsWithOffsets>,
) -> TokenizedInput
pub fn build_input_with_special_tokens( &self, token_ids_with_offsets_1: TokenIdsWithOffsets, token_ids_with_offsets_2: Option<TokenIdsWithOffsets>, ) -> TokenizedInput
Interface method to build input with special tokens
Sourcepub fn get_prefix_and_forced_bos_id(
&self,
source_language: Option<&Language>,
target_language: Option<&Language>,
supported_source_languages: &HashSet<Language>,
supported_target_languages: &HashSet<Language>,
) -> Result<(Option<String>, Option<i64>), RustBertError>
pub fn get_prefix_and_forced_bos_id( &self, source_language: Option<&Language>, target_language: Option<&Language>, supported_source_languages: &HashSet<Language>, supported_target_languages: &HashSet<Language>, ) -> Result<(Option<String>, Option<i64>), RustBertError>
Helper function to prepare the input for translation models
Sourcepub fn convert_tokens_to_ids<S>(&self, tokens: &[S]) -> Vec<i64>
pub fn convert_tokens_to_ids<S>(&self, tokens: &[S]) -> Vec<i64>
Interface method to convert tokens to ids
Sourcepub fn get_unk_id(&self) -> i64
pub fn get_unk_id(&self) -> i64
Interface method to get the unknown token id
Sourcepub fn get_pad_id(&self) -> Option<i64>
pub fn get_pad_id(&self) -> Option<i64>
Interface method to get the padding token id, if any
Sourcepub fn get_sep_id(&self) -> Option<i64>
pub fn get_sep_id(&self) -> Option<i64>
Interface method to get the separator token id, if any
Sourcepub fn get_mask_id(&self) -> Option<i64>
pub fn get_mask_id(&self) -> Option<i64>
Interface method to get the mask token id, if any
Sourcepub fn get_mask_value(&self) -> Option<&str>
pub fn get_mask_value(&self) -> Option<&str>
Interface method to get the mask token value (as a string), if any
Sourcepub fn get_bos_id(&self) -> Option<i64>
pub fn get_bos_id(&self) -> Option<i64>
Interface method to get the beginning-of-sequence token id, if any
Sourcepub fn get_eos_id(&self) -> Option<i64>
pub fn get_eos_id(&self) -> Option<i64>
Interface method to get the end-of-sequence token id, if any
pub fn tokenize_and_pad<'a, S>( &self, input: S, max_length: usize, device: Device, ) -> (Tensor, Tensor)
Sourcepub fn add_extra_ids(&mut self, num_extra_ids: i64)
pub fn add_extra_ids(&mut self, num_extra_ids: i64)
Interface method
Sourcepub fn add_tokens(&mut self, tokens: &[&str])
pub fn add_tokens(&mut self, tokens: &[&str])
Interface method
Auto Trait Implementations§
impl !Freeze for TokenizerOption
impl RefUnwindSafe for TokenizerOption
impl Send for TokenizerOption
impl Sync for TokenizerOption
impl Unpin for TokenizerOption
impl UnwindSafe for TokenizerOption
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left` is `true`. Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left(&self)` returns `true`. Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise. Read more