pub trait MultiThreadedTokenizer<T: Vocab>
where
    Self: Sync + Send + Tokenizer<T>,
{
    // Provided methods
    fn vocab(&self) -> &T { ... }
    fn tokenize_list_with_offsets<S>(
        &self,
        text_list: &[S]
    ) -> Vec<TokensWithOffsets>
       where S: AsRef<str> + Sync { ... }
    fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
       where S: AsRef<str> + Sync { ... }
    fn encode_list<S>(
        &self,
        text_list: &[S],
        max_len: usize,
        truncation_strategy: &TruncationStrategy,
        stride: usize
    ) -> Vec<TokenizedInput>
       where S: AsRef<str> + Sync { ... }
    fn encode_pair_list<S>(
        &self,
        text_list: &[(S, S)],
        max_len: usize,
        truncation_strategy: &TruncationStrategy,
        stride: usize
    ) -> Vec<TokenizedInput>
       where S: AsRef<str> + Sync { ... }
    fn decode_list(
        &self,
        token_ids_list: &[Vec<i64>],
        skip_special_tokens: bool,
        clean_up_tokenization_spaces: bool
    ) -> Vec<String> { ... }
}
Extension of Tokenizer for tokenizers that are Sync + Send, providing multithreaded versions of the tokenization, encoding and decoding methods.

Provided Methods§

fn vocab(&self) -> &T

Returns a reference to the tokenizer vocabulary.
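
A minimal sketch of using the vocabulary reference, assuming the Vocab trait's token_to_id lookup; the fully qualified call is used because Tokenizer exposes an identically named vocab method:

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer};
use rust_tokenizers::vocab::{BaseVocab, Vocab};
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

// BaseTokenizer also implements Tokenizer, which has its own vocab method,
// so the trait is named explicitly to disambiguate.
let vocab = MultiThreadedTokenizer::vocab(&tokenizer);
let id = vocab.token_to_id("hello");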

fn tokenize_list_with_offsets<S>(&self, text_list: &[S]) -> Vec<TokensWithOffsets>
where
    S: AsRef<str> + Sync,

Multithreaded tokenization of a list of strings, where each string corresponds to, for example, a sentence. Returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on each element of the list provided.

Parameters
  • text_list: list of strings to tokenize
Returns

Vec<TokensWithOffsets> with the token strings representation and offsets

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list_with_offsets(&text);
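
Each TokensWithOffsets in the returned vector corresponds to one input string and carries the token strings together with their offsets into the original text.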

fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
where
    S: AsRef<str> + Sync,

Multithreaded tokenization of a list of strings, returning the tokens for each input without offset information. This calls tokenize on each element of the list provided.

Parameters
  • text_list: list of strings to tokenize
Returns

Vec<Vec<String>> with the token strings representation

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let texts = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list(&texts);

fn encode_list<S>(
    &self,
    text_list: &[S],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize,
) -> Vec<TokenizedInput>
where
    S: AsRef<str> + Sync,

Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with encode, which accepts an optional second text, each text provided here is encoded independently.

Parameters
  • text_list: list of input texts (&str) to encode; each text is turned into a complete encoding using the build_input_with_special_tokens method.
  • max_len (usize): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy provided.
  • truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
  • stride (usize): amount of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns

Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided text

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text_1 = "Hello, world!";
let text_2 = "How is it going?";
let text_3 = "Very well thank you.";
let encoded_input = tokenizer.encode_list(
    &[text_1, text_2, text_3],
    5,
    &TruncationStrategy::LongestFirst,
    2,
);
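
Here the three texts are encoded independently. Any encoding longer than 5 tokens is truncated following the LongestFirst strategy, with the removed tokens returned as overflowing tokens in the corresponding TokenizedInput; the stride of 2 shifts the input to allow overlapping sequences to be generated from the overflow.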

fn encode_pair_list<S>(
    &self,
    text_list: &[(S, S)],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize,
) -> Vec<TokenizedInput>
where
    S: AsRef<str> + Sync,

Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines the pair handling of encode with the list processing of encode_list.

Parameters
  • text_list: list of input text pairs (&str) to encode; each pair is combined into a single encoding using the build_input_with_special_tokens method.
  • max_len (usize): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy provided.
  • truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
  • stride (usize): amount of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns

Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided text pair

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text_1 = "Hello, world!";
let text_2 = "This is a second sentence";
let text_3 = "Very well thank you.";
let text_4 = "This is another second sentence.";
let encoded_input = tokenizer.encode_pair_list(
    &[(text_1, text_2), (text_3, text_4)],
    5,
    &TruncationStrategy::LongestFirst,
    2,
);
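
As with encode_list, each pair is encoded independently and truncated to max_len following the strategy provided; the segment ids in each TokenizedInput distinguish the first text of the pair from the second.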

fn decode_list(
    &self,
    token_ids_list: &[Vec<i64>],
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool,
) -> Vec<String>

Multithreaded conversion of a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode on each provided sequence of ids.

Parameters
  • token_ids_list: list of lists of tokenized input ids. Can be obtained using the encode or encode_plus methods.
  • skip_special_tokens: if set to true, special tokens are removed from the output.
  • clean_up_tokenization_spaces: if set to true, tokenization spaces are cleaned up.
Returns
  • Vec<String>: the decoded sentences, one per input sequence
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let token_ids_list = vec![vec![0, 1, 2, 42], vec![99, 3]];
let decoded_list = tokenizer.decode_list(
    &token_ids_list,
    skip_special_tokens,
    clean_up_tokenization_spaces,
);

Object Safety§

This trait is not object safe: its tokenization and encoding methods are generic over the input type S, so it cannot be used behind a dyn pointer.

Implementors§

impl MultiThreadedTokenizer<AlbertVocab> for AlbertTokenizer

impl MultiThreadedTokenizer<BertVocab> for BertTokenizer

impl MultiThreadedTokenizer<DeBERTaV2Vocab> for DeBERTaV2Tokenizer

impl MultiThreadedTokenizer<DeBERTaVocab> for DeBERTaTokenizer

impl MultiThreadedTokenizer<FNetVocab> for FNetTokenizer

impl MultiThreadedTokenizer<Gpt2Vocab> for Gpt2Tokenizer

impl MultiThreadedTokenizer<M2M100Vocab> for M2M100Tokenizer

impl MultiThreadedTokenizer<MBart50Vocab> for MBart50Tokenizer

impl MultiThreadedTokenizer<MarianVocab> for MarianTokenizer

impl MultiThreadedTokenizer<NLLBVocab> for NLLBTokenizer

impl MultiThreadedTokenizer<OpenAiGptVocab> for CtrlTokenizer

impl MultiThreadedTokenizer<OpenAiGptVocab> for OpenAiGptTokenizer

impl MultiThreadedTokenizer<PegasusVocab> for PegasusTokenizer

impl MultiThreadedTokenizer<ProphetNetVocab> for ProphetNetTokenizer

impl MultiThreadedTokenizer<ReformerVocab> for ReformerTokenizer

impl MultiThreadedTokenizer<RobertaVocab> for RobertaTokenizer

impl MultiThreadedTokenizer<SentencePieceVocab> for SentencePieceBpeTokenizer

impl MultiThreadedTokenizer<SentencePieceVocab> for SentencePieceTokenizer

impl MultiThreadedTokenizer<T5Vocab> for T5Tokenizer

impl MultiThreadedTokenizer<XLMRobertaVocab> for XLMRobertaTokenizer

impl MultiThreadedTokenizer<XLNetVocab> for XLNetTokenizer

impl<T: Vocab + Sync + Send> MultiThreadedTokenizer<T> for BaseTokenizer<T>