[−][src]Trait rust_tokenizers::vocab::Vocab
Base Vocab trait
Defines a common interface to the vocabularies for use in the tokenizers.
Required methods
fn unknown_value() -> &'static str
Associated function returning the unknown value for the vocabulary
fn get_unknown_value(&self) -> &'static str
Returns the unknown value on an instance
fn values(&self) -> &HashMap<String, i64>
Return the map of token strings to IDs
fn indices(&self) -> &HashMap<i64, String>
Return the map of token IDs to strings
fn special_values(&self) -> &HashMap<String, i64>
Return the map of token strings to IDs for special values
fn special_indices(&self) -> &HashMap<i64, String>
Return the map of token IDs to strings for special values
fn from_file(path: &str) -> Result<Self, TokenizerError> where
    Self: Sized,
Read a vocabulary from file
Example
use rust_tokenizers::vocab::{BertVocab, Vocab}; let path = "path/to/file"; let base_vocab = BertVocab::from_file(path);
fn token_to_id(&self, token: &str) -> i64
Converts a token to an id.
Parameters
- token (
&str
): token to convert
Returns
i64
: token index for the value provided. If not found in the indices, returns the unknown token index
fn id_to_token(&self, id: &i64) -> String
Converts an id to a token.
Parameters
- id (
&i64
): token id to convert
Returns
String
: token value for the index provided. If not found in the indices, returns the unknown token value
Provided methods
fn read_vocab_file(path: &str) -> Result<HashMap<String, i64>, TokenizerError>
Read a Bert-style vocab.txt file (single column, one token per line)
The from_file
method should be preferred, and needs to be implemented by the specific vocabularies
fn _token_to_id(
    &self,
    token: &str,
    values: &HashMap<String, i64>,
    special_values: &HashMap<String, i64>,
    unknown_value: &str
) -> i64
Converts a token to an id, provided a HashMap
of values, a HashMap
of special values and
the unknown value token string representation. This is not meant to be used directly; the method
token_to_id
offers a more convenient interface for most vocabularies, but needs to be implemented
by the specific vocabulary.
Parameters
- token (
&str
): token to convert - values (
&HashMap<String, i64>
): mapping from tokens to ids - special_values (
&HashMap<String, i64>
): mapping from special tokens to ids - unknown_value (
&str
): unknown token value
Returns
i64
: index value for the provided token
fn _id_to_token(
    &self,
    id: &i64,
    indices: &HashMap<i64, String>,
    special_indices: &HashMap<i64, String>,
    unknown_value: &str
) -> String
Converts an id to a token, provided a HashMap
of values, a HashMap
of special values and
the unknown value token string representation. This is not meant to be used directly; the method
id_to_token
offers a more convenient interface for most vocabularies, but needs to be implemented
by the specific vocabulary.
Parameters
- id (
&i64
): token id to convert - indices (
&HashMap<i64, String>
): mapping from ids to tokens - special_indices (
&HashMap<i64, String>
): mapping from special ids to tokens - unknown_value (
&str
): unknown token value
Returns
String
: token value for the index provided. If not found in the indices, returns the unknown token value
fn _register_as_special_value(
    token: &str,
    values: &HashMap<String, i64>,
    special_values: &mut HashMap<String, i64>
) -> Result<(), TokenizerError>
Register a token as a special value
Parameters
- token (
&str
): token to register as a special value - values (
&HashMap<String, i64>
): mapping from tokens to ids. This should contain the token to add and will be used to read the id for registration in special_values
- special_values (
&mut HashMap<String, i64>
): mapping from special tokens to ids
fn convert_tokens_to_ids(&self, tokens: &[&str]) -> Vec<i64>
Converts a list of tokens to a list of indices.
Parameters
- tokens (
&[&str]
): list of tokens to convert
Returns
Vec<i64>
: Vector containing the indices for the tokens provided
Implementors
impl Vocab for AlbertVocab
[src]
fn unknown_value() -> &'static str
[src]
fn get_unknown_value(&self) -> &'static str
[src]
fn values(&self) -> &HashMap<String, i64>
[src]
fn indices(&self) -> &HashMap<i64, String>
[src]
fn special_values(&self) -> &HashMap<String, i64>
[src]
fn special_indices(&self) -> &HashMap<i64, String>
[src]
fn from_file(path: &str) -> Result<AlbertVocab, TokenizerError>
[src]
fn token_to_id(&self, token: &str) -> i64
[src]
fn id_to_token(&self, id: &i64) -> String
[src]
impl Vocab for BaseVocab
[src]
fn unknown_value() -> &'static str
[src]
fn get_unknown_value(&self) -> &'static str
[src]
fn values(&self) -> &HashMap<String, i64>
[src]
fn indices(&self) -> &HashMap<i64, String>
[src]
fn special_values(&self) -> &HashMap<String, i64>
[src]
fn special_indices(&self) -> &HashMap<i64, String>
[src]
fn from_file(path: &str) -> Result<BaseVocab, TokenizerError>
[src]
fn token_to_id(&self, token: &str) -> i64
[src]
fn id_to_token(&self, id: &i64) -> String
[src]
impl Vocab for BertVocab
[src]
fn unknown_value() -> &'static str
[src]
fn get_unknown_value(&self) -> &'static str
[src]
fn values(&self) -> &HashMap<String, i64>
[src]
fn indices(&self) -> &HashMap<i64, String>
[src]
fn special_values(&self) -> &HashMap<String, i64>
[src]
fn special_indices(&self) -> &HashMap<i64, String>
[src]
fn from_file(path: &str) -> Result<BertVocab, TokenizerError>
[src]
fn token_to_id(&self, token: &str) -> i64
[src]
fn id_to_token(&self, id: &i64) -> String
[src]
impl Vocab for Gpt2Vocab
[src]
fn unknown_value() -> &'static str
[src]
fn get_unknown_value(&self) -> &'static str
[src]
fn values(&self) -> &HashMap<String, i64>
[src]
fn indices(&self) -> &HashMap<i64, String>
[src]
fn special_values(&self) -> &HashMap<String, i64>
[src]
fn special_indices(&self) -> &HashMap<i64, String>
[src]
fn from_file(path: &str) -> Result<Gpt2Vocab, TokenizerError>
[src]
fn token_to_id(&self, token: &str) -> i64
[src]
fn id_to_token(&self, id: &i64) -> String
[src]
impl Vocab for MarianVocab
[src]
fn unknown_value() -> &'static str
[src]
fn get_unknown_value(&self) -> &'static str
[src]
fn values(&self) -> &HashMap<String, i64>
[src]
fn indices(&self) -> &HashMap<i64, String>
[src]
fn special_values(&self) -> &HashMap<String, i64>
[src]
fn special_indices(&self) -> &HashMap<i64, String>
[src]
fn from_file(path: &str) -> Result<MarianVocab, TokenizerError>
[src]
fn token_to_id(&self, token: &str) -> i64
[src]
fn id_to_token(&self, id: &i64) -> String
[src]
impl Vocab for OpenAiGptVocab
[src]
fn unknown_value() -> &'static str
[src]
fn get_unknown_value(&self) -> &'static str
[src]
fn values(&self) -> &HashMap<String, i64>
[src]
fn indices(&self) -> &HashMap<i64, String>
[src]
fn special_values(&self) -> &HashMap<String, i64>
[src]
fn special_indices(&self) -> &HashMap<i64, String>
[src]
fn from_file(path: &str) -> Result<OpenAiGptVocab, TokenizerError>
[src]
fn token_to_id(&self, token: &str) -> i64
[src]
fn id_to_token(&self, id: &i64) -> String
[src]
impl Vocab for RobertaVocab
[src]
fn unknown_value() -> &'static str
[src]
fn get_unknown_value(&self) -> &'static str
[src]
fn values(&self) -> &HashMap<String, i64>
[src]
fn indices(&self) -> &HashMap<i64, String>
[src]
fn special_values(&self) -> &HashMap<String, i64>
[src]
fn special_indices(&self) -> &HashMap<i64, String>
[src]
fn from_file(path: &str) -> Result<RobertaVocab, TokenizerError>
[src]
Read a Roberta-style vocab.json file