pub trait MultiThreadedTokenizer<T: Vocab>
where
    Self: Sync + Send + Tokenizer<T>,
{
// Provided methods
fn vocab(&self) -> &T { ... }
fn tokenize_list_with_offsets<S>(
&self,
text_list: &[S]
) -> Vec<TokensWithOffsets>
where S: AsRef<str> + Sync { ... }
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
where S: AsRef<str> + Sync { ... }
fn encode_list<S>(
&self,
text_list: &[S],
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>
where S: AsRef<str> + Sync { ... }
fn encode_pair_list<S>(
&self,
text_list: &[(S, S)],
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>
where S: AsRef<str> + Sync { ... }
fn decode_list(
&self,
token_ids_list: &[Vec<i64>],
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> Vec<String> { ... }
}
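Any tokenizer that is Sync + Send and implements Tokenizer<T> gets these parallel list methods for free as provided methods. Below is a minimal usage sketch, not a definitive recipe: the vocabulary path is a placeholder (as in the method examples that follow), and fully qualified syntax is used because Tokenizer defines identically named list methods.
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", false, false).unwrap();
// Single-input methods come from the Tokenizer supertrait...
let single = tokenizer.tokenize("Hello, world!");
// ...while the list methods process their inputs in parallel. Fully
// qualified syntax avoids ambiguity with Tokenizer::tokenize_list.
let batch =
    MultiThreadedTokenizer::tokenize_list(&tokenizer, &["Hello, world!", "Second sentence"]);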
Provided Methods
fn tokenize_list_with_offsets<S>(&self, text_list: &[S]) -> Vec<TokensWithOffsets>
where
    S: AsRef<str> + Sync,
Tokenizes a list of strings with multithreading, where each string corresponds, for example, to a sentence, and returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on each element of the list provided.
Parameters
- text_list: list of strings to tokenize
Returns
Vec<TokensWithOffsets> with the token string representations and their offsets
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list_with_offsets(&text);
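Each returned TokensWithOffsets pairs the token strings with the character spans they came from. A hedged sketch of consuming the result, assuming the struct's public tokens and offsets fields (offsets are Option values, since inserted tokens may have no source span):
// Print every token together with its source offsets.
for tokens_with_offsets in &tokens {
    for (token, offset) in tokens_with_offsets
        .tokens
        .iter()
        .zip(tokens_with_offsets.offsets.iter())
    {
        println!("{}: {:?}", token, offset);
    }
}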
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
where
    S: AsRef<str> + Sync,
Multithreaded tokenization of a list of strings, returning the token strings for each input (without offset information)
Parameters
- text_list: list of strings to tokenize
Returns
Vec<Vec<String>> with the token string representations
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let texts = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list(&texts);
fn encode_list<S>(
    &self,
    text_list: &[S],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput>
where
    S: AsRef<str> + Sync,
Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with encode and its optional second text, each text provided is encoded independently.
Parameters
- text_list: sequence of input texts (&str) to encode, each combined into a single encoding by using the build_input_with_special_tokens method
- max_len (usize): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy provided
- truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
- stride (usize): number of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns
Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided text
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text_1 = "Hello, world!";
let text_2 = "How is it going?";
let text_3 = "Very well thank you.";
let encoded_input = tokenizer.encode_list(
&[text_1, text_2, text_3],
5,
&TruncationStrategy::LongestFirst,
2,
);
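With max_len set to 5 and a stride of 2, any input longer than five tokens is truncated and the removed ids are retained. A hedged sketch of inspecting the output, assuming the TokenizedInput fields token_ids, num_truncated_tokens and overflowing_tokens implied by the Returns description:
// Inspect each encoding: ids removed by truncation end up in
// overflowing_tokens rather than being discarded.
for encoding in &encoded_input {
    println!(
        "ids: {:?}, truncated: {}, overflowing: {:?}",
        encoding.token_ids, encoding.num_truncated_tokens, encoding.overflowing_tokens
    );
}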
fn encode_pair_list<S>(
    &self,
    text_list: &[(S, S)],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput>
where
    S: AsRef<str> + Sync,
Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list.
Parameters
- text_list: sequence of input text pairs ((S, S)) to encode, each pair combined into a single encoding by using the build_input_with_special_tokens method
- max_len (usize): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy provided
- truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
- stride (usize): number of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns
Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided pair
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text_1 = "Hello, world!";
let text_2 = "This is a second sentence";
let text_3 = "Very well thank you.";
let text_4 = "This is another second sentence.";
let encoded_input = tokenizer.encode_pair_list(
&[(text_1, text_2), (text_3, text_4)],
5,
&TruncationStrategy::LongestFirst,
2,
);
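For pair inputs the two texts share one encoding, and the segment ids record which text each token belongs to. A hedged sketch, assuming the TokenizedInput fields token_ids and segment_ids implied by the Returns description:
// segment_ids are 0 for tokens from the first text of a pair and 1
// for tokens from the second, as consumed by BERT-style models.
for encoding in &encoded_input {
    println!("ids: {:?}", encoding.token_ids);
    println!("segments: {:?}", encoding.segment_ids);
}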
fn decode_list(
    &self,
    token_ids_list: &[Vec<i64>],
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>
Multithreaded conversion of a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids.
Arguments
- token_ids_list: list of lists of tokenized input ids. Can be obtained using the encode or encode_plus methods.
- skip_special_tokens: if set to true, special tokens are removed from the output.
- clean_up_tokenization_spaces: if set to true, tokenization spaces in the output are cleaned up.
Returns
Vec<String>: the decoded sentences
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let token_ids_list = vec![vec![0, 1, 2, 42], vec![99, 3]];
let decoded_list = tokenizer.decode_list(
&token_ids_list,
skip_special_tokens,
clean_up_tokenization_spaces,
);
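Since decode_list is the inverse of the encoding methods above, ids produced by encode_list can be fed straight back. A hedged round-trip sketch, reusing the tokenizer above and assuming the token_ids field of TokenizedInput:
// Encode a batch, then decode the resulting ids back to text.
let encodings =
    tokenizer.encode_list(&["Hello, world!"], 128, &TruncationStrategy::LongestFirst, 0);
let ids: Vec<Vec<i64>> = encodings.into_iter().map(|e| e.token_ids).collect();
let round_trip = tokenizer.decode_list(&ids, true, true);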