Trait tokenizers::tokenizer::Model

/// Represents a model used during tokenization (such as BPE, WordLevel,
/// WordPiece, or Unigram).
pub trait Model {
    /// Trainer type paired with this model: it is the type returned by
    /// `get_trainer`, and it must implement both `Trainer` and `Sync`.
    type Trainer: Trainer + Sync;

    // Required methods
    /// Tokenize the given sequence into multiple underlying `Token`s. The
    /// offsets on each `Token` are expected to be relative to `sequence`.
    fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>;
    /// Find the ID associated with a string token; the `Option` return type
    /// allows for a token that has no ID.
    fn token_to_id(&self, token: &str) -> Option<u32>;
    /// Find the string token associated with an ID; the `Option` return type
    /// allows for an ID that has no token.
    fn id_to_token(&self, id: u32) -> Option<String>;
    /// Retrieve the entire vocabulary mapping (token -> ID) as an owned map.
    fn get_vocab(&self) -> HashMap<String, u32>;
    /// Retrieve the size of the vocabulary.
    fn get_vocab_size(&self) -> usize;
    /// Save the current model in `folder`, using the optional `prefix` for the
    /// various files that need to be saved.
    /// NOTE(review): the returned `Vec<PathBuf>` is presumably the list of
    /// files written — confirm against an implementor.
    fn save(&self, folder: &Path, prefix: Option<&str>) -> Result<Vec<PathBuf>>;
    /// Get an instance of a `Trainer` capable of training this model.
    fn get_trainer(&self) -> <Self as Model>::Trainer;
}

Expand description

Represents a model used during tokenization (such as BPE, WordLevel, WordPiece, or Unigram).

Required Associated Types§

type Trainer: Trainer + Sync

Required Methods§

fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>

Tokenize the given sequence into multiple underlying Tokens. The offsets on each Token are expected to be relative to the given sequence.

fn token_to_id(&self, token: &str) -> Option<u32>

Find the ID associated with a string token.

fn id_to_token(&self, id: u32) -> Option<String>

Find the string token associated with an ID.

fn get_vocab(&self) -> HashMap<String, u32>

Retrieve the entire vocabulary mapping (token -> ID)

fn get_vocab_size(&self) -> usize

Retrieve the size of the vocabulary

fn save(&self, folder: &Path, prefix: Option<&str>) -> Result<Vec<PathBuf>>

Save the current Model in the given folder, using the given prefix for the various files that need to be saved.

fn get_trainer(&self) -> <Self as Model>::Trainer

Get an instance of a Trainer capable of training this Model

Implementors§

impl Model for ModelWrapper

type Trainer = TrainerWrapper

impl Model for BPE

type Trainer = BpeTrainer

impl Model for Unigram

type Trainer = UnigramTrainer

impl Model for WordLevel

type Trainer = WordLevelTrainer

impl Model for WordPiece

type Trainer = WordPieceTrainer