Trait rust_tokenizers::vocab::Vocab

pub trait Vocab {
    fn unknown_value() -> &'static str;
    fn get_unknown_value(&self) -> &'static str;
    fn values(&self) -> &HashMap<String, i64>;
    fn indices(&self) -> &HashMap<i64, String>;
    fn special_values(&self) -> &HashMap<String, i64>;
    fn special_indices(&self) -> &HashMap<i64, String>;
    fn from_file(path: &str) -> Result<Self, TokenizerError>
    where
        Self: Sized;
    fn token_to_id(&self, token: &str) -> i64;
    fn id_to_token(&self, id: &i64) -> String;

    fn read_vocab_file(
        path: &str
    ) -> Result<HashMap<String, i64>, TokenizerError> { ... }
    fn _token_to_id(
        &self,
        token: &str,
        values: &HashMap<String, i64>,
        special_values: &HashMap<String, i64>,
        unknown_value: &str
    ) -> i64 { ... }
    fn _id_to_token(
        &self,
        id: &i64,
        indices: &HashMap<i64, String>,
        special_indices: &HashMap<i64, String>,
        unknown_value: &str
    ) -> String { ... }
    fn _register_as_special_value(
        token: &str,
        values: &HashMap<String, i64>,
        special_values: &mut HashMap<String, i64>
    ) -> Result<(), TokenizerError> { ... }
    fn convert_tokens_to_ids(&self, tokens: &[&str]) -> Vec<i64> { ... }
}

Base Vocab trait

Defines a common interface to the vocabularies for use in the tokenizers.

Required methods

fn unknown_value() -> &'static str

Associated function returning the unknown value for the vocabulary

fn get_unknown_value(&self) -> &'static str

Returns the unknown value on an instance
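
Example

A minimal sketch contrasting the associated function with the instance method. The vocabulary path is a placeholder and the loaded file is assumed to be a valid Bert-style vocab.txt:

use rust_tokenizers::vocab::{BertVocab, Vocab};

// Associated function: no instance required
let unknown = BertVocab::unknown_value();

// Instance method: the same value, read from a loaded vocabulary
let vocab = BertVocab::from_file("path/to/vocab.txt").expect("vocabulary file not found");
assert_eq!(unknown, vocab.get_unknown_value());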

fn values(&self) -> &HashMap<String, i64>

Return the map of token strings to IDs

fn indices(&self) -> &HashMap<i64, String>

Return the map of token IDs to strings

fn special_values(&self) -> &HashMap<String, i64>

Return the map of special token strings to IDs

fn special_indices(&self) -> &HashMap<i64, String>

Return the map of token IDs to strings for special values
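
Example

A sketch inspecting the accessor maps, assuming a Bert-style vocabulary at a placeholder path:

use rust_tokenizers::vocab::{BertVocab, Vocab};

let vocab = BertVocab::from_file("path/to/vocab.txt").expect("vocabulary file not found");

// values() and indices() are inverse mappings over the full vocabulary
let vocab_size = vocab.values().len();

// special_values() and special_indices() only cover the special tokens
let n_special = vocab.special_values().len();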

fn from_file(path: &str) -> Result<Self, TokenizerError> where
    Self: Sized

Read a vocabulary from file

Example

use rust_tokenizers::vocab::{BertVocab, Vocab};
let path = "path/to/file";

let base_vocab = BertVocab::from_file(path);

fn token_to_id(&self, token: &str) -> i64

Converts a token to an id.

Parameters

  • token (&str): token to convert

Returns

  • i64: token index for the token provided. If the token is not found in the vocabulary, the unknown token index is returned
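
Example

A minimal lookup sketch, assuming a Bert-style vocabulary at a placeholder path; the second token is deliberately out-of-vocabulary:

use rust_tokenizers::vocab::{BertVocab, Vocab};

let vocab = BertVocab::from_file("path/to/vocab.txt").expect("vocabulary file not found");

// A known token maps to its index
let id = vocab.token_to_id("hello");

// An out-of-vocabulary token maps to the unknown token index
let unknown_id = vocab.token_to_id("certainly-not-in-the-vocabulary");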

fn id_to_token(&self, id: &i64) -> String

Converts an id to a token.

Parameters

  • id (&i64): token id to convert

Returns

  • String: token value for the index provided. If not found in the indices, returns the unknown token value
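
Example

A minimal reverse-lookup sketch, assuming a Bert-style vocabulary at a placeholder path. Note that the id is passed by reference, matching the signature above:

use rust_tokenizers::vocab::{BertVocab, Vocab};

let vocab = BertVocab::from_file("path/to/vocab.txt").expect("vocabulary file not found");

// Returns the token string for index 0, or the unknown token value if the index is absent
let token = vocab.id_to_token(&0);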

Provided methods

fn read_vocab_file(path: &str) -> Result<HashMap<String, i64>, TokenizerError>

Read a Bert-style vocab.txt file (single column, one token per line). The from_file method should be preferred and needs to be implemented by the specific vocabularies
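
Example

A sketch reading the raw token-to-id map directly, assuming a Bert-style vocab.txt at a placeholder path; from_file remains the preferred entry point:

use rust_tokenizers::vocab::{BertVocab, Vocab};

// Each line of the file holds a single token; the resulting map goes from token string to id
let values = BertVocab::read_vocab_file("path/to/vocab.txt").expect("vocabulary file not found");
let maybe_id = values.get("hello");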

fn _token_to_id(
    &self,
    token: &str,
    values: &HashMap<String, i64>,
    special_values: &HashMap<String, i64>,
    unknown_value: &str
) -> i64

Converts a token to an id, given a HashMap of values, a HashMap of special values, and the string representation of the unknown token. This helper is not meant to be used directly; the token_to_id method offers a more convenient interface for most vocabularies, but it needs to be implemented by the specific vocabulary.

Parameters

  • token (&str): token to convert
  • values (&HashMap<String, i64>): mapping from tokens to ids
  • special_values (&HashMap<String, i64>): mapping from special tokens to ids
  • unknown_value (&str): unknown token value

Returns

  • i64: index value for the provided token

fn _id_to_token(
    &self,
    id: &i64,
    indices: &HashMap<i64, String>,
    special_indices: &HashMap<i64, String>,
    unknown_value: &str
) -> String

Converts an id to a token, given a HashMap of indices, a HashMap of special indices, and the string representation of the unknown token. This helper is not meant to be used directly; the id_to_token method offers a more convenient interface for most vocabularies, but it needs to be implemented by the specific vocabulary.

Parameters

  • id (&i64): token id to convert
  • indices (&HashMap<i64, String>): mapping from ids to tokens
  • special_indices (&HashMap<i64, String>): mapping from ids to special tokens
  • unknown_value (&str): unknown token value

Returns

  • String: token value for the index provided. If not found in the indices, returns the unknown token value

fn _register_as_special_value(
    token: &str,
    values: &HashMap<String, i64>,
    special_values: &mut HashMap<String, i64>
) -> Result<(), TokenizerError>

Register a token as a special value

Parameters

  • token (&str): token to register as a special value
  • values (&HashMap<String, i64>): mapping from tokens to ids. This must already contain the token to register; it is used to look up the id that is inserted into special_values
  • special_values (&HashMap<String, i64>): mapping from special tokens to ids

fn convert_tokens_to_ids(&self, tokens: &[&str]) -> Vec<i64>

Converts a list of tokens to a list of indices.

Parameters

  • tokens (&[&str]): list of tokens to convert

Returns

  • Vec<i64>: Vector containing the indices for the tokens provided
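
Example

A minimal batch-conversion sketch, assuming a Bert-style vocabulary at a placeholder path; the last token is deliberately out-of-vocabulary:

use rust_tokenizers::vocab::{BertVocab, Vocab};

let vocab = BertVocab::from_file("path/to/vocab.txt").expect("vocabulary file not found");

// Each token is converted with token_to_id; unknown tokens map to the unknown token index
let ids = vocab.convert_tokens_to_ids(&["hello", "world", "certainly-not-in-the-vocabulary"]);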

Implementors

impl Vocab for AlbertVocab

impl Vocab for BaseVocab

impl Vocab for BertVocab

impl Vocab for Gpt2Vocab

impl Vocab for MarianVocab

impl Vocab for OpenAiGptVocab

impl Vocab for RobertaVocab

fn from_file(path: &str) -> Result<RobertaVocab, TokenizerError>

Read a Roberta-style vocab.json file

impl Vocab for SentencePieceVocab

impl Vocab for T5Vocab

impl Vocab for XLMRobertaVocab

impl Vocab for XLNetVocab
