Enum TokenizerOption

Source
pub enum TokenizerOption {
Show 19 variants Bert(BertTokenizer), Deberta(DeBERTaTokenizer), DebertaV2(DeBERTaV2Tokenizer), Roberta(RobertaTokenizer), XLMRoberta(XLMRobertaTokenizer), Marian(MarianTokenizer), T5(T5Tokenizer), Albert(AlbertTokenizer), XLNet(XLNetTokenizer), GPT2(Gpt2Tokenizer), OpenAiGpt(OpenAiGptTokenizer), Reformer(ReformerTokenizer), ProphetNet(ProphetNetTokenizer), Pegasus(PegasusTokenizer), MBart50(MBart50Tokenizer), M2M100(M2M100Tokenizer), NLLB(NLLBTokenizer), FNet(FNetTokenizer), Bart(RobertaTokenizer),
}
Expand description

§Abstraction that holds a particular tokenizer, which can be any one of the supported models' tokenizers

Variants§

§

Bert(BertTokenizer)

Bert Tokenizer

§

Deberta(DeBERTaTokenizer)

DeBERTa Tokenizer

§

DebertaV2(DeBERTaV2Tokenizer)

DeBERTa V2 Tokenizer

§

Roberta(RobertaTokenizer)

Roberta Tokenizer

§

XLMRoberta(XLMRobertaTokenizer)

XLMRoberta Tokenizer

§

Marian(MarianTokenizer)

Marian Tokenizer

§

T5(T5Tokenizer)

T5 Tokenizer

§

Albert(AlbertTokenizer)

Albert Tokenizer

§

XLNet(XLNetTokenizer)

XLNet Tokenizer

§

GPT2(Gpt2Tokenizer)

GPT2 Tokenizer

§

OpenAiGpt(OpenAiGptTokenizer)

OpenAI GPT Tokenizer

§

Reformer(ReformerTokenizer)

Reformer Tokenizer

§

ProphetNet(ProphetNetTokenizer)

ProphetNet Tokenizer

§

Pegasus(PegasusTokenizer)

Pegasus Tokenizer

§

MBart50(MBart50Tokenizer)

MBart50 Tokenizer

§

M2M100(M2M100Tokenizer)

M2M100 Tokenizer

§

NLLB(NLLBTokenizer)

NLLB Tokenizer

§

FNet(FNetTokenizer)

FNet Tokenizer

§

Bart(RobertaTokenizer)

Bart Tokenizer

Implementations§

Source§

impl TokenizerOption

Source

pub fn from_file( model_type: ModelType, vocab_path: &str, merges_path: Option<&str>, lower_case: bool, strip_accents: impl Into<Option<bool>>, add_prefix_space: impl Into<Option<bool>>, ) -> Result<Self, RustBertError>

Interface method to load a tokenizer from file

Source

pub fn encode_list<S>( &self, text_list: &[S], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize, ) -> Vec<TokenizedInput>
where S: AsRef<str> + Send + Sync,

Interface method for list encoding

Source

pub fn encode_pair_list( &self, text_pair_list: &[(&str, &str)], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize, ) -> Vec<TokenizedInput>

Interface method for pair encoding

Source

pub fn encode_pair( &self, text_1: &str, text_2: Option<&str>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize, ) -> TokenizedInput

Interface method for pair encoding (single input)

Source

pub fn tokenize(&self, text: &str) -> Vec<String>

Interface method for tokenization

Source

pub fn tokenize_with_offsets(&self, text: &str) -> TokensWithOffsets

Interface method for tokenization that also returns token offsets

Source

pub fn tokenize_list<S>(&self, text: &[S]) -> Vec<Vec<String>>
where S: AsRef<str> + Send + Sync,

Interface method for tokenization of a list of texts

Source

pub fn decode( &self, token_ids: &[i64], skip_special_tokens: bool, clean_up_tokenization_spaces: bool, ) -> String

Interface method for decoding

Source

pub fn build_input_with_special_tokens( &self, token_ids_with_offsets_1: TokenIdsWithOffsets, token_ids_with_offsets_2: Option<TokenIdsWithOffsets>, ) -> TokenizedInput

Interface method to build input with special tokens

Source

pub fn get_prefix_and_forced_bos_id( &self, source_language: Option<&Language>, target_language: Option<&Language>, supported_source_languages: &HashSet<Language>, supported_target_languages: &HashSet<Language>, ) -> Result<(Option<String>, Option<i64>), RustBertError>

Helper function to prepare the input for translation models

Source

pub fn convert_tokens_to_ids<S>(&self, tokens: &[S]) -> Vec<i64>
where S: AsRef<str>,

Interface method to convert tokens to ids

Source

pub fn get_unk_id(&self) -> i64

Interface method to get the id of the unknown token

Source

pub fn get_pad_id(&self) -> Option<i64>

Interface method to get the id of the padding token, if any

Source

pub fn get_sep_id(&self) -> Option<i64>

Interface method to get the id of the separator token, if any

Source

pub fn get_mask_id(&self) -> Option<i64>

Interface method to get the id of the mask token, if any

Source

pub fn get_mask_value(&self) -> Option<&str>

Interface method to get the string value of the mask token, if any

Source

pub fn get_bos_id(&self) -> Option<i64>

Interface method to get the id of the beginning-of-sequence token, if any

Source

pub fn get_eos_id(&self) -> Option<i64>

Interface method to get the id of the end-of-sequence token, if any

Source

pub fn tokenize_and_pad<'a, S>( &self, input: S, max_length: usize, device: Device, ) -> (Tensor, Tensor)
where S: AsRef<[&'a str]>,

Source

pub fn add_extra_ids(&mut self, num_extra_ids: i64)

Interface method to add extra ids to the tokenizer

Source

pub fn add_tokens(&mut self, tokens: &[&str])

Interface method to add tokens to the tokenizer

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T> Instrument for T

Source§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more
Source§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> IntoEither for T

Source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

impl<T> Pointable for T

Source§

const ALIGN: usize

The alignment of pointer.
Source§

type Init = T

The type for initializers.
Source§

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a value with the given initializer. Read more
Source§

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more
Source§

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more
Source§

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more
Source§

impl<T> Same for T

Source§

type Output = T

Should always be Self
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V

Source§

impl<T> WithSubscriber for T

Source§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more
Source§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more
Source§

impl<T> ErasedDestructor for T
where T: 'static,

Source§

impl<T> MaybeSendSync for T