Trait tokenizers::tokenizer::Model

source ·
/// Represents a model used during tokenization (like BPE or Word or Unigram).
pub trait Model {
    /// Trainer type handed back by `get_trainer`; it must implement both
    /// `Trainer` and `Sync`.
    type Trainer: Trainer + Sync;

    // Required methods
    /// Tokenize the given sequence into multiple underlying `Token`s. The
    /// offsets on each `Token` are expected to be relative to the given
    /// sequence.
    fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>;
    /// Find the ID associated with a string token.
    fn token_to_id(&self, token: &str) -> Option<u32>;
    /// Find the string token associated with an ID.
    fn id_to_token(&self, id: u32) -> Option<String>;
    /// Retrieve the entire vocabulary mapping (token -> ID).
    fn get_vocab(&self) -> HashMap<String, u32>;
    /// Retrieve the size of the vocabulary.
    fn get_vocab_size(&self) -> usize;
    /// Save the current `Model` in the given folder, using the given prefix
    /// for the various files that need to be saved.
    fn save(&self, folder: &Path, prefix: Option<&str>) -> Result<Vec<PathBuf>>;
    /// Get an instance of a `Trainer` capable of training this `Model`.
    fn get_trainer(&self) -> <Self as Model>::Trainer;
}
Expand description

Represents a model used during tokenization (such as BPE, Word, or Unigram).

Required Associated Types§

Required Methods§

source

fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>

Tokenizes the given sequence into multiple underlying Tokens. The offsets on each Token are expected to be relative to the given sequence.

source

fn token_to_id(&self, token: &str) -> Option<u32>

Find the ID associated with a string token.

source

fn id_to_token(&self, id: u32) -> Option<String>

Find the string token associated with an ID.

source

fn get_vocab(&self) -> HashMap<String, u32>

Retrieve the entire vocabulary mapping (token -> ID)

source

fn get_vocab_size(&self) -> usize

Retrieve the size of the vocabulary

source

fn save(&self, folder: &Path, prefix: Option<&str>) -> Result<Vec<PathBuf>>

Save the current Model in the given folder, using the given prefix for the various files that need to be saved.

source

fn get_trainer(&self) -> <Self as Model>::Trainer

Get an instance of a Trainer capable of training this Model

Implementors§