pub struct AlbertVocab {
    pub values: HashMap<String, i64>,
    pub indices: HashMap<i64, String>,
    pub special_token_map: SpecialTokenMap,
    pub special_values: HashMap<String, i64>,
    pub special_indices: HashMap<i64, String>,
}

AlbertVocab

Vocabulary for the ALBERT tokenizer. Contains the following special values:

  • BOS token
  • EOS token
  • CLS token
  • SEP token
  • PAD token
  • MASK token

Expects a SentencePiece protobuf file when created from file.
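
For instance, loading the vocabulary from such a file and querying it might look as follows. This is a minimal sketch: the spiece.model path is a placeholder, and the rust_tokenizers::vocab import path is assumed.

    use rust_tokenizers::vocab::{AlbertVocab, Vocab};

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        // Placeholder path to a local ALBERT SentencePiece model.
        let vocab = AlbertVocab::from_file("path/to/spiece.model")?;

        // SentencePiece pieces carry the "▁" word-boundary marker.
        let id = vocab.token_to_id("▁hello");
        println!("'▁hello' -> {} -> '{}'", id, vocab.id_to_token(&id));
        Ok(())
    }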

Fields§

§values: HashMap<String, i64>

A mapping of tokens as strings to indices (i.e. the encoder base)

§indices: HashMap<i64, String>

A mapping of token ids to strings (i.e. the decoder base)

§special_token_map: SpecialTokenMap

Special tokens used by the vocabulary

§special_values: HashMap<String, i64>

A mapping of special value tokens as strings to IDs (i.e. the encoder base for special values). Special values typically include BOS/EOS markers, class markers, mask markers and padding markers

§special_indices: HashMap<i64, String>

A mapping of special value tokens as IDs to strings (i.e. the decoder base for special values)
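
Since the fields are public, the forward and reverse maps can also be inspected directly. A minimal sketch (placeholder model path, same assumed imports):

    use rust_tokenizers::vocab::{AlbertVocab, Vocab};

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let vocab = AlbertVocab::from_file("path/to/spiece.model")?;

        // `values`/`indices` are the regular encoder/decoder maps;
        // `special_values`/`special_indices` cover only the special tokens.
        println!("vocabulary size: {}", vocab.values.len());
        println!("special tokens: {:?}", vocab.special_values.keys().collect::<Vec<_>>());

        // The two directions are kept consistent: `indices` inverts `values`.
        if let Some(token) = vocab.indices.get(&0) {
            assert_eq!(vocab.values.get(token), Some(&0));
        }
        Ok(())
    }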

Implementations§

impl AlbertVocab

pub fn get_pad_value(&self) -> &str

pub fn get_bos_value(&self) -> &str

pub fn get_sep_value(&self) -> &str

pub fn get_cls_value(&self) -> &str

pub fn get_eos_value(&self) -> &str

pub fn get_mask_value(&self) -> &str
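
A short usage sketch of these getters (placeholder model path, rust_tokenizers::vocab imports assumed):

    use rust_tokenizers::vocab::{AlbertVocab, Vocab};

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let vocab = AlbertVocab::from_file("path/to/spiece.model")?;

        // String form of the special tokens; ids can be recovered with token_to_id.
        println!("CLS:  {} (id {})", vocab.get_cls_value(), vocab.token_to_id(vocab.get_cls_value()));
        println!("SEP:  {}", vocab.get_sep_value());
        println!("PAD:  {}", vocab.get_pad_value());
        println!("MASK: {}", vocab.get_mask_value());
        println!("BOS/EOS: {} / {}", vocab.get_bos_value(), vocab.get_eos_value());
        Ok(())
    }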

Trait Implementations§

impl Clone for AlbertVocab

fn clone(&self) -> AlbertVocab

Returns a copy of the value. Read more

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

impl Debug for AlbertVocab

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

impl MultiThreadedTokenizer<AlbertVocab> for AlbertTokenizer

fn vocab(&self) -> &T

Returns a reference to the tokenizer vocabulary

fn tokenize_list_with_offsets<S>(&self, text_list: &[S]) -> Vec<TokensWithOffsets> where S: AsRef<str> + Sync

Tokenize a list of strings (with multithreading), where each corresponds to for example a sentence, returning a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on the list provided. Read more

fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>> where S: AsRef<str> + Sync

Multithreaded tokenization of a list of strings, returning a vector of tokens (as strings) for each input text. Read more

fn encode_list<S>(&self, text_list: &[S], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput> where S: AsRef<str> + Sync

Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with encode and its optional second text, each text provided is encoded independently. Read more

fn encode_pair_list<S>(&self, text_list: &[(S, S)], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput> where S: AsRef<str> + Sync

Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list. Read more

fn decode_list(&self, token_ids_list: &[Vec<i64>], skip_special_tokens: bool, clean_up_tokenization_spaces: bool) -> Vec<String>

Multithreaded conversion of a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids. Read more
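
A usage sketch of the multithreaded interface. The model path is a placeholder, and the AlbertTokenizer::from_file(path, lower_case, strip_accents) constructor and rust_tokenizers::tokenizer module path are assumptions not shown on this page.

    use rust_tokenizers::tokenizer::{AlbertTokenizer, MultiThreadedTokenizer, TruncationStrategy};

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        // Placeholder path; lower_case and strip_accents are set to false here.
        let tokenizer = AlbertTokenizer::from_file("path/to/spiece.model", false, false)?;

        let texts = ["The quick brown fox.", "Jumps over the lazy dog."];

        // Each text is tokenized and encoded independently, in parallel.
        let encodings = MultiThreadedTokenizer::encode_list(
            &tokenizer,
            &texts[..],
            128,
            &TruncationStrategy::LongestFirst,
            0,
        );
        for encoding in &encodings {
            println!("{:?}", encoding.token_ids);
        }
        Ok(())
    }
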
impl Tokenizer<AlbertVocab> for AlbertTokenizer

fn vocab(&self) -> &AlbertVocab

Returns a reference to the tokenizer vocabulary

fn vocab_mut(&mut self) -> &mut AlbertVocab

Returns a mutable reference to the tokenizer vocabulary

fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>

Tokenize a TokenRef, returning a sequence of tokens Read more

fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String

Converts a sequence of strings into a single string. This will clean up artifacts from tokenization (for example sub ##word) and generate a single output string. Read more

fn build_input_with_special_tokens(&self, tokens_ids_with_offsets_1: TokenIdsWithOffsets, tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>) -> TokenIdsWithSpecialTokens

Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and adding special tokens. Read more
fn tokenize(&self, text: &str) -> Vec<String>

Tokenize a string, returns a vector of tokens as strings. Use tokenize_with_offsets or tokenize_to_tokens to return offset information. Read more

fn tokenize_with_offsets(&self, text: &str) -> TokensWithOffsets

Tokenize a string, returning tokens with offset information Read more

fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>> where S: AsRef<str>

Tokenize a list of strings, returning a vector of tokens (as strings) for each input text. Read more

fn tokenize_list_with_offsets<S>(&self, text_list: &[S]) -> Vec<TokensWithOffsets> where S: AsRef<str>

Tokenize a list of strings, where each corresponds to for example a sentence, returning a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on the list provided. Read more

fn convert_tokens_to_ids<S>(&self, tokens: &[S]) -> Vec<i64> where S: AsRef<str>

Convert a slice of string-like values to a vector of token indices. Read more

fn encode(&self, text_1: &str, text_2: Option<&str>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> TokenizedInput

Encode a string-like input (tokenization followed by encoding) Read more
fn encode_list<S>(&self, text_list: &[S], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput> where S: AsRef<str>

Encode a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with encode and its optional second text, each text provided is encoded independently. Read more

fn encode_pair_list<S>(&self, text_list: &[(S, S)], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput> where S: AsRef<str>

Encode a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list. Read more

fn decode_to_vec(&self, token_ids: &[i64], skip_special_tokens: bool) -> Vec<String>

Decode a sequence of token indices to a sequence of Strings, optionally skipping special indices Read more

fn decode(&self, token_ids: &[i64], skip_special_tokens: bool, clean_up_tokenization_spaces: bool) -> String

Converts a sequence of ids (integers) into a string, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. Read more

fn clean_up_tokenization(&self, input_string: String) -> String

Cleans up tokenization artifacts (for example whitespace before punctuation) Read more

fn decode_list(&self, token_ids_list: &[Vec<i64>], skip_special_tokens: bool, clean_up_tokenization_spaces: bool) -> Vec<String>

Converts a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids. Read more

fn add_tokens(&mut self, tokens: &[&str])

Add arbitrary tokens to the vocabulary. Read more

fn add_extra_ids(&mut self, num_extra_ids: i64)

Add extra token ids to the vocabulary. Read more
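
A minimal end-to-end sketch of the single-threaded interface (same assumptions as above: placeholder model path, AlbertTokenizer::from_file constructor, rust_tokenizers::tokenizer imports):

    use rust_tokenizers::tokenizer::{AlbertTokenizer, Tokenizer, TruncationStrategy};

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let tokenizer = AlbertTokenizer::from_file("path/to/spiece.model", false, false)?;

        // Plain tokenization into SentencePiece tokens.
        let tokens = tokenizer.tokenize("Hello, world!");
        println!("{:?}", tokens);

        // Encoding a single text and a text pair; special tokens (CLS/SEP)
        // are added by build_input_with_special_tokens.
        let single = tokenizer.encode("Hello, world!", None, 128, &TruncationStrategy::LongestFirst, 0);
        let pair = tokenizer.encode("First sentence.", Some("Second sentence."), 128, &TruncationStrategy::LongestFirst, 0);
        println!("single: {:?}", single.token_ids);
        println!("pair:   {:?}", pair.token_ids);

        // Decoding back to text, dropping special tokens and cleaning up spaces.
        println!("{}", tokenizer.decode(&single.token_ids, true, true));
        Ok(())
    }
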
impl Vocab for AlbertVocab

fn get_unknown_value(&self) -> &str

Returns the unknown token value of the vocabulary

fn values(&self) -> &HashMap<String, i64>

Return the map of token strings to IDs

fn indices(&self) -> &HashMap<i64, String>

Return the map of token IDs to strings

fn special_values(&self) -> &HashMap<String, i64>

Return the map of special token strings to IDs

fn special_indices(&self) -> &HashMap<i64, String>

Return the map of token IDs to strings for special values

fn values_mut(&mut self) -> &mut HashMap<String, i64>

Return a mutable reference to the map of token strings to IDs

fn indices_mut(&mut self) -> &mut HashMap<i64, String>

Return a mutable reference to the map of token IDs to strings

fn special_values_mut(&mut self) -> &mut HashMap<String, i64>

Return a mutable reference to the map of special token strings to IDs

fn special_indices_mut(&mut self) -> &mut HashMap<i64, String>

Return a mutable reference to the map of token IDs to strings for special values
fn from_file<P: AsRef<Path>>(path: P) -> Result<AlbertVocab, TokenizerError>

Read a vocabulary from file Read more

fn from_file_with_special_token_mapping<P: AsRef<Path>, S: AsRef<Path>>(path: P, special_token_mapping_path: S) -> Result<Self, TokenizerError>

Read a vocabulary from file with special token mapping Read more

fn from_values_and_special_token_map(values: HashMap<String, i64>, special_token_map: SpecialTokenMap) -> Result<Self, TokenizerError> where Self: Sized

fn token_to_id(&self, token: &str) -> i64

Converts a token to an id. Read more

fn id_to_token(&self, id: &i64) -> String

Converts an id to a token. Read more

fn _token_to_id(&self, token: &str, values: &HashMap<String, i64>, special_values: &HashMap<String, i64>, unknown_value: &str) -> i64

Converts a token to an id, provided a HashMap of values, a HashMap of special values and the unknown value token string representation. This is not meant to be used directly; the method token_to_id offers a more convenient interface for most vocabularies, but needs to be implemented by the specific vocabulary. Read more

fn _id_to_token(&self, id: &i64, indices: &HashMap<i64, String>, special_indices: &HashMap<i64, String>, unknown_value: &str) -> String

Converts an id to a token, provided a HashMap of values, a HashMap of special values and the unknown value token string representation. This is not meant to be used directly; the method id_to_token offers a more convenient interface for most vocabularies, but needs to be implemented by the specific vocabulary. Read more

fn convert_tokens_to_ids(&self, tokens: &[&str]) -> Vec<i64>

Converts a list of tokens to a list of indices. Read more

fn add_extra_ids(&mut self, num_extra_ids: i64)

Add extra token ids to the vocab Read more

fn add_tokens(&mut self, tokens: &[&str])

Add arbitrary tokens to the vocabulary. Read more
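
A small sketch of the Vocab trait in use (placeholder model path; the added token name is purely illustrative):

    use rust_tokenizers::vocab::{AlbertVocab, Vocab};

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let mut vocab = AlbertVocab::from_file("path/to/spiece.model")?;

        // Unknown strings map to the unknown token id rather than failing.
        let unk = vocab.get_unknown_value().to_owned();
        assert_eq!(vocab.token_to_id("definitely-not-a-piece"), vocab.token_to_id(&unk));

        // Registering additional tokens extends both directions of the mapping.
        vocab.add_tokens(&["<new_token>"]);
        let new_id = vocab.token_to_id("<new_token>");
        assert_eq!(vocab.id_to_token(&new_id), "<new_token>");
        Ok(())
    }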

Auto Trait Implementations§

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

impl<T> Borrow<T> for T where T: ?Sized

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

impl<T> BorrowMut<T> for T where T: ?Sized

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

impl<T> From<T> for T

fn from(t: T) -> T

Returns the argument unchanged.

impl<T, U> Into<U> for T where U: From<T>

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

impl<T> Pointable for T

const ALIGN: usize = _

The alignment of the pointer.

type Init = T

The type for initializers.

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a pointer with the given initializer. Read more

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more
impl<T> ToOwned for T where T: Clone

type Owned = T

The resulting type after obtaining ownership.

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more

impl<T, U> TryFrom<U> for T where U: Into<T>

type Error = Infallible

The type returned in the event of a conversion error.

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

impl<T, U> TryInto<U> for T where U: TryFrom<T>

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.