Struct rust_tokenizers::vocab::RobertaVocab
pub struct RobertaVocab {
pub values: HashMap<String, i64>,
pub indices: HashMap<i64, String>,
pub special_token_map: SpecialTokenMap,
pub special_values: HashMap<String, i64>,
pub special_indices: HashMap<i64, String>,
}
RoBERTa Vocab
Vocabulary for RoBERTa tokenizer. Contains the following special values:
- PAD token
- BOS token
- EOS token
- SEP token
- MASK token
- CLS token
Expects a JSON-format vocabulary when created from file.
Fields
values: HashMap<String, i64>
A mapping of tokens as strings to indices (i.e. the encoder base)
indices: HashMap<i64, String>
A mapping of token IDs to strings (i.e. the decoder base)
special_token_map: SpecialTokenMap
Special tokens used by the vocabulary
special_values: HashMap<String, i64>
A mapping of special value tokens as strings to IDs (i.e. the encoder base for special values). Special values typically include BOS/EOS markers, class markers, mask markers and padding markers.
special_indices: HashMap<i64, String>
A mapping of special value tokens as IDs to strings (i.e. the decoder base for special values)
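The four maps mirror each other in pairs: indices is the inverse of values, and special_indices is the inverse of special_values. A minimal sketch of that relationship using only the standard library (illustrative, not the crate's code):

```rust
use std::collections::HashMap;

// Illustrative sketch: the decoder base (indices) is simply the inverse
// of the encoder base (values); the same relationship holds for the
// special_values / special_indices pair.
fn build_indices(values: &HashMap<String, i64>) -> HashMap<i64, String> {
    values.iter().map(|(token, &id)| (id, token.clone())).collect()
}

fn main() {
    let mut values: HashMap<String, i64> = HashMap::new();
    values.insert("hello".to_string(), 0);
    values.insert("world".to_string(), 1);
    let indices = build_indices(&values);
    assert_eq!(indices.get(&0), Some(&"hello".to_string()));
    assert_eq!(indices.get(&1), Some(&"world".to_string()));
}
```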
Implementations
impl RobertaVocab
pub fn get_pad_value(&self) -> &str
pub fn get_bos_value(&self) -> &str
pub fn get_sep_value(&self) -> &str
pub fn get_cls_value(&self) -> &str
pub fn get_eos_value(&self) -> &str
pub fn get_mask_value(&self) -> &str
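A hypothetical stand-in illustrating what these getters return. The token strings below (<s>, </s>, <pad>, <mask>, with <s>/</s> doubling as CLS/SEP) are the common RoBERTa conventions, assumed here for illustration; the real values come from the vocabulary's SpecialTokenMap:

```rust
// Hypothetical stand-in for the getters above; the token strings are the
// common RoBERTa conventions, assumed rather than read from the crate.
struct SpecialTokens {
    pad: String,
    bos: String,
    eos: String,
    sep: String,
    cls: String,
    mask: String,
}

impl SpecialTokens {
    fn roberta_defaults() -> Self {
        SpecialTokens {
            pad: "<pad>".to_string(),
            bos: "<s>".to_string(),
            eos: "</s>".to_string(),
            sep: "</s>".to_string(), // RoBERTa reuses EOS as SEP
            cls: "<s>".to_string(),  // and BOS as CLS
            mask: "<mask>".to_string(),
        }
    }
    fn get_pad_value(&self) -> &str { &self.pad }
    fn get_bos_value(&self) -> &str { &self.bos }
    fn get_sep_value(&self) -> &str { &self.sep }
    fn get_cls_value(&self) -> &str { &self.cls }
    fn get_eos_value(&self) -> &str { &self.eos }
    fn get_mask_value(&self) -> &str { &self.mask }
}

fn main() {
    let s = SpecialTokens::roberta_defaults();
    assert_eq!(s.get_pad_value(), "<pad>");
    assert_eq!(s.get_cls_value(), s.get_bos_value());
}
```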
Trait Implementations
impl Clone for RobertaVocab
fn clone(&self) -> RobertaVocab
Returns a copy of the value.
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source.
impl Debug for RobertaVocab
impl MultiThreadedTokenizer<RobertaVocab> for RobertaTokenizer
fn tokenize_list_with_offsets<S>(&self, text_list: &[S]) -> Vec<TokensWithOffsets>
where S: AsRef<str> + Sync
Tokenize a list of strings (with multithreading), where each corresponds to for example a sentence; returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on each element of the list provided.

fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
where S: AsRef<str> + Sync
Multithreaded tokenization of a list of strings, returning a vector of token lists.
fn encode_list<S>(&self, text_list: &[S], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput>
where S: AsRef<str> + Sync
Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that in contrast with encode, which accepts an optional second text, each text provided here is encoded independently.

fn encode_pair_list<S>(&self, text_list: &[(S, S)], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput>
where S: AsRef<str> + Sync
Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list.

fn decode_list(&self, token_ids_list: &[Vec<i64>], skip_special_tokens: bool, clean_up_tokenization_spaces: bool) -> Vec<String>
Multithreaded conversion of a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids.

impl Tokenizer<RobertaVocab> for RobertaTokenizer
fn vocab(&self) -> &RobertaVocab
Returns a reference to the tokenizer vocabulary.
fn vocab_mut(&mut self) -> &mut RobertaVocab
Returns a mutable reference to the tokenizer vocabulary.
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
Tokenize a TokenRef, returning a sequence of tokens.
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
Converts a sequence of strings into a single string. This cleans up artifacts from tokenization (for example sub ##word) and generates a single output string.

fn build_input_with_special_tokens(&self, tokens_ids_with_offsets_1: TokenIdsWithOffsets, tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>) -> TokenIdsWithSpecialTokens
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and adding special tokens.
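As a sketch of the pattern such inputs follow for RoBERTa-style models: a single sequence becomes <s> A </s>, and a pair becomes <s> A </s></s> B </s>. The ids 0 and 2 for <s> and </s> below are the common RoBERTa convention, assumed here for illustration; the real method also tracks offsets, masks and segment ids.

```rust
// Illustrative sketch of the RoBERTa special-token pattern (assumed ids):
// single sequence: <s> A </s>; pair: <s> A </s></s> B </s>.
fn build_input(ids_1: Vec<i64>, ids_2: Option<Vec<i64>>) -> Vec<i64> {
    const BOS: i64 = 0; // "<s>", doubles as CLS (assumed id)
    const EOS: i64 = 2; // "</s>", doubles as SEP (assumed id)
    let mut out = vec![BOS];
    out.extend(ids_1);
    out.push(EOS);
    if let Some(ids_2) = ids_2 {
        // RoBERTa separates the pair with a double </s>.
        out.push(EOS);
        out.extend(ids_2);
        out.push(EOS);
    }
    out
}

fn main() {
    assert_eq!(build_input(vec![10, 11], None), vec![0, 10, 11, 2]);
    assert_eq!(build_input(vec![10], Some(vec![20])), vec![0, 10, 2, 2, 20, 2]);
}
```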
fn tokenize(&self, text: &str) -> Vec<String>
Tokenize a string, returning a vector of tokens as strings. Use tokenize_with_offsets or tokenize_to_tokens to obtain offset information.

fn tokenize_with_offsets(&self, text: &str) -> TokensWithOffsets
Tokenize a string, returning tokens with offset information.
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
where S: AsRef<str>
Tokenize a list of strings, returning a vector of token lists.
fn tokenize_list_with_offsets<S>(&self, text_list: &[S]) -> Vec<TokensWithOffsets>
where S: AsRef<str>
Tokenize a list of strings, where each corresponds to for example a sentence; returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on each element of the list provided.

fn convert_tokens_to_ids<S>(&self, tokens: &[S]) -> Vec<i64>
where S: AsRef<str>
Convert a slice of string-like items to a vector of token indices.
fn encode(&self, text_1: &str, text_2: Option<&str>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> TokenizedInput
Encode a string-like input (tokenization followed by encoding).
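The max_len, truncation_strategy and stride parameters control truncation. A simplified sketch of the assumed semantics for a single sequence: keep the first max_len ids and report the remainder as overflow, with stride ids of overlap for context (the crate's actual TokenizedInput carries more fields, and pair truncation depends on the chosen TruncationStrategy):

```rust
// Simplified sketch (assumed semantics, not the crate's code): truncate a
// single id sequence to max_len, returning (kept, overflow) where the
// overflow window starts `stride` ids before the cut point.
fn truncate_with_stride(ids: &[i64], max_len: usize, stride: usize) -> (Vec<i64>, Vec<i64>) {
    if ids.len() <= max_len {
        return (ids.to_vec(), Vec::new());
    }
    let kept = ids[..max_len].to_vec();
    // Overlap: back up `stride` ids from the truncation point.
    let overflow_start = max_len.saturating_sub(stride);
    let overflow = ids[overflow_start..].to_vec();
    (kept, overflow)
}

fn main() {
    let ids = vec![1, 2, 3, 4, 5, 6];
    let (kept, overflow) = truncate_with_stride(&ids, 4, 1);
    assert_eq!(kept, vec![1, 2, 3, 4]);
    assert_eq!(overflow, vec![4, 5, 6]); // one id of overlap with `kept`
}
```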
fn encode_list<S>(&self, text_list: &[S], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput>
where S: AsRef<str>
Encode a sequence of string-like texts (tokenization followed by encoding). Note that in contrast with encode, which accepts an optional second text, each text provided here is encoded independently.

fn encode_pair_list<S>(&self, text_list: &[(S, S)], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput>
where S: AsRef<str>
Encode a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list.

fn decode_to_vec(&self, token_ids: &[i64], skip_special_tokens: bool) -> Vec<String>
Decode a sequence of token indices to a sequence of Strings, optionally skipping special indices.
fn decode(&self, token_ids: &[i64], skip_special_tokens: bool, clean_up_tokenization_spaces: bool) -> String
Converts a sequence of ids (integers) into a string, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces.
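A minimal sketch of what the two flags do, under assumed, simplified semantics (the hypothetical decode function below is not the crate's implementation): skip_special_tokens drops ids present in the special map, and clean_up_tokenization_spaces removes spaces before punctuation:

```rust
use std::collections::HashMap;

// Hypothetical, simplified decode: joins looked-up tokens with spaces,
// optionally skipping special ids and cleaning space-before-punctuation.
fn decode(
    ids: &[i64],
    indices: &HashMap<i64, String>,
    special_indices: &HashMap<i64, String>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool,
) -> String {
    let mut out = String::new();
    for id in ids {
        if skip_special_tokens && special_indices.contains_key(id) {
            continue; // drop special tokens such as </s> or <pad>
        }
        if let Some(token) = special_indices.get(id).or_else(|| indices.get(id)) {
            if !out.is_empty() {
                out.push(' ');
            }
            out.push_str(token);
        }
    }
    if clean_up_tokenization_spaces {
        // Very rough cleanup: remove the space before common punctuation.
        out = out.replace(" .", ".").replace(" ,", ",").replace(" !", "!");
    }
    out
}

fn main() {
    let mut indices = HashMap::new();
    indices.insert(0, "hello".to_string());
    indices.insert(1, ".".to_string());
    let mut special = HashMap::new();
    special.insert(2, "</s>".to_string());
    assert_eq!(decode(&[0, 1, 2], &indices, &special, true, true), "hello.");
}
```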
fn clean_up_tokenization(&self, input_string: String) -> String
Cleans up tokenization artifacts (for example whitespace before punctuation).
fn decode_list(&self, token_ids_list: &[Vec<i64>], skip_special_tokens: bool, clean_up_tokenization_spaces: bool) -> Vec<String>
Converts a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids.

fn add_tokens(&mut self, tokens: &[&str])
Add arbitrary tokens to the vocabulary.
fn add_extra_ids(&mut self, num_extra_ids: i64)
Add extra token ids to the vocabulary.
impl Vocab for RobertaVocab
fn from_file<P: AsRef<Path>>(path: P) -> Result<RobertaVocab, TokenizerError>
Read a RoBERTa-style vocab.json file.
fn get_unknown_value(&self) -> &str
Returns the unknown value of the vocabulary instance.
fn special_indices(&self) -> &HashMap<i64, String>
Return the map of token IDs to strings for special values.
fn values_mut(&mut self) -> &mut HashMap<String, i64>
Return a mutable reference to the map of token strings to IDs.
fn indices_mut(&mut self) -> &mut HashMap<i64, String>
Return a mutable reference to the map of token IDs to strings.
fn special_values_mut(&mut self) -> &mut HashMap<String, i64>
Return a mutable reference to the map of token strings to IDs for special values.
fn special_indices_mut(&mut self) -> &mut HashMap<i64, String>
Return a mutable reference to the map of token IDs to strings for special values.
fn from_file_with_special_token_mapping<P: AsRef<Path>, S: AsRef<Path>>(path: P, special_token_mapping_path: S) -> Result<Self, TokenizerError>
Read a vocabulary from a file with a special token mapping.

fn from_values_and_special_token_map(values: HashMap<String, i64>, special_token_map: SpecialTokenMap) -> Result<Self, TokenizerError>
where Self: Sized
fn _token_to_id(&self, token: &str, values: &HashMap<String, i64>, special_values: &HashMap<String, i64>, unknown_value: &str) -> i64
Converts a token to an id, provided a HashMap of values, a HashMap of special values, and the string representation of the unknown token. This is not meant to be used directly; the method token_to_id offers a more convenient interface for most vocabularies, but needs to be implemented by the specific vocabulary.

fn _id_to_token(&self, id: &i64, indices: &HashMap<i64, String>, special_indices: &HashMap<i64, String>, unknown_value: &str) -> String
Converts an id to a token, provided a HashMap of indices, a HashMap of special indices, and the string representation of the unknown token. This is not meant to be used directly; the method id_to_token offers a more convenient interface for most vocabularies, but needs to be implemented by the specific vocabulary.

fn convert_tokens_to_ids(&self, tokens: &[&str]) -> Vec<i64>
Converts a list of tokens to a list of indices.
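The lookup order implied by _token_to_id can be sketched as follows (an assumption based on the signature, not the crate's code): special values first, then regular values, then the unknown token's id. The mask id 50264 below is likewise only an illustrative assumption.

```rust
use std::collections::HashMap;

// Sketch of the assumed lookup order: special values, then regular
// values, then fall back to the unknown token's own id.
fn token_to_id(
    token: &str,
    values: &HashMap<String, i64>,
    special_values: &HashMap<String, i64>,
    unknown_value: &str,
) -> i64 {
    special_values
        .get(token)
        .or_else(|| values.get(token))
        .or_else(|| values.get(unknown_value))
        .copied()
        .unwrap_or(-1) // only reached if the unknown token itself is missing
}

fn main() {
    let mut values = HashMap::new();
    values.insert("hello".to_string(), 5);
    values.insert("<unk>".to_string(), 3);
    let mut special = HashMap::new();
    special.insert("<mask>".to_string(), 50264); // assumed id, for illustration
    assert_eq!(token_to_id("hello", &values, &special, "<unk>"), 5);
    assert_eq!(token_to_id("<mask>", &values, &special, "<unk>"), 50264);
    // An out-of-vocabulary token maps to the unknown token's id.
    assert_eq!(token_to_id("zzz", &values, &special, "<unk>"), 3);
}
```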
fn add_extra_ids(&mut self, num_extra_ids: i64)
Add extra token ids to the vocab.

fn add_tokens(&mut self, tokens: &[&str])
Add arbitrary tokens to the vocabulary.
Auto Trait Implementations
impl RefUnwindSafe for RobertaVocab
impl Send for RobertaVocab
impl Sync for RobertaVocab
impl Unpin for RobertaVocab
impl UnwindSafe for RobertaVocab
Blanket Implementations
impl<T> BorrowMut<T> for T
where T: ?Sized
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value.