Trait rust_tokenizers::tokenizer::MultiThreadedTokenizer[][src]

pub trait MultiThreadedTokenizer<T: Vocab> where
    Self: Sync + Send + Tokenizer<T>, 
{ fn vocab(&self) -> &T { ... }
fn tokenize_list_with_offsets<S>(
        &self,
        text_list: &[S]
    ) -> Vec<TokensWithOffsets>
Notable traits for Vec<u8, A>
impl<A> Write for Vec<u8, A> where
    A: Allocator

    where
        S: AsRef<str> + Sync
, { ... }
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
Notable traits for Vec<u8, A>
impl<A> Write for Vec<u8, A> where
    A: Allocator

    where
        S: AsRef<str> + Sync
, { ... }
fn encode_list<S>(
        &self,
        text_list: &[S],
        max_len: usize,
        truncation_strategy: &TruncationStrategy,
        stride: usize
    ) -> Vec<TokenizedInput>
Notable traits for Vec<u8, A>
impl<A> Write for Vec<u8, A> where
    A: Allocator

    where
        S: AsRef<str> + Sync
, { ... }
fn encode_pair_list<S>(
        &self,
        text_list: &[(S, S)],
        max_len: usize,
        truncation_strategy: &TruncationStrategy,
        stride: usize
    ) -> Vec<TokenizedInput>
Notable traits for Vec<u8, A>
impl<A> Write for Vec<u8, A> where
    A: Allocator

    where
        S: AsRef<str> + Sync
, { ... }
fn decode_list(
        &self,
        token_ids_list: &[Vec<i64>],
        skip_special_tokens: bool,
        clean_up_tokenization_spaces: bool
    ) -> Vec<String>
Notable traits for Vec<u8, A>
impl<A> Write for Vec<u8, A> where
    A: Allocator
{ ... } }
Expand description

Provided methods

returns a reference to the tokenizer vocabulary

Tokenize a list of strings (with multithreading), where each corresponds to for example a sentence, returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on the list provided.

Parameters
  • text_list: list of strings to tokenize
Returns

Vec<TokensWithOffsets> with the token strings representation and offsets

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list_with_offsets(&text);

Multithreaded tokenization of a list of strings, returning tokens with offset information

Parameters
  • text_list: list of strings to tokenize
Returns

Vec<Vec<String>> with the token strings representation

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let texts = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list(&texts);

Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that in contrast with encode's optional second text, each text provided is encoded independently.

Parameters
  • text_list: sequence of input text (&str) to encode combined into a single encoding by using the build_input_with_special_tokens method.
  • max_len (usize): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy provided.
  • truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
  • stride (usize): amount of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns

Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided text

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text_1 = "Hello, world!";
let text_2 = "How is it going?";
let text_3 = "Very well thank you.";
let encoded_input = tokenizer.encode_list(
    &[text_1, text_2, text_3],
    5,
    &TruncationStrategy::LongestFirst,
    2,
);

Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list.

Parameters
  • text_list: sequence of input text (&str) to encode combined into a single encoding by using the build_input_with_special_tokens method.
  • max_len (usize): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy provided.
  • truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
  • stride (usize): amount of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns

Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided text pair

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text_1 = "Hello, world!";
let text_2 = "This is a second sentence";
let text_3 = "Very well thank you.";
let text_4 = "This is another second sentence.";
let encoded_input = tokenizer.encode_pair_list(
    &[(text_1, text_2), (text_3, text_4)],
    5,
    &TruncationStrategy::LongestFirst,
    2,
);

Multithreaded conversion of a list of sequences of ids (integer) into strings, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids.

Arguments
  • token_ids_list: list of lists of tokenized input ids. Each inner list can be obtained using the encode or encode_plus methods.
  • skip_special_tokens: if set to True, will skip the special tokens in the output.
  • clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
Returns
  • Vec<String>: decoded sentences, one per input sequence of ids
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let token_ids_list = vec![vec![0, 1, 2, 42], vec![99, 3]];
let decoded_list = tokenizer.decode_list(
    &token_ids_list,
    skip_special_tokens,
    clean_up_tokenization_spaces,
);

Implementors