Trait rust_tokenizers::tokenizer::MultiThreadedTokenizer[][src]

pub trait MultiThreadedTokenizer<T: Vocab> where
    Self: Sync + Send + Tokenizer<T>, 
{ fn vocab(&self) -> &T { ... }
fn tokenize_list_with_offsets<S>(
        &self,
        text_list: &[S]
    ) -> Vec<TokensWithOffsets>
Notable traits for Vec<u8, A>
impl<A> Write for Vec<u8, A> where
    A: Allocator

    where
        S: AsRef<str> + Sync
, { ... }
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
Notable traits for Vec<u8, A>
impl<A> Write for Vec<u8, A> where
    A: Allocator

    where
        S: AsRef<str> + Sync
, { ... }
fn encode_list<S>(
        &self,
        text_list: &[S],
        max_len: usize,
        truncation_strategy: &TruncationStrategy,
        stride: usize
    ) -> Vec<TokenizedInput>
Notable traits for Vec<u8, A>
impl<A> Write for Vec<u8, A> where
    A: Allocator

    where
        S: AsRef<str> + Sync
, { ... }
fn encode_pair_list<S>(
        &self,
        text_list: &[(S, S)],
        max_len: usize,
        truncation_strategy: &TruncationStrategy,
        stride: usize
    ) -> Vec<TokenizedInput>
Notable traits for Vec<u8, A>
impl<A> Write for Vec<u8, A> where
    A: Allocator

    where
        S: AsRef<str> + Sync
, { ... }
fn decode_list(
        &self,
        token_ids_list: &[Vec<i64>],
        skip_special_tokens: bool,
        clean_up_tokenization_spaces: bool
    ) -> Vec<String>
Notable traits for Vec<u8, A>
impl<A> Write for Vec<u8, A> where
    A: Allocator
{ ... } }
Expand description

Provided methods

returns a reference to the tokenizer vocabulary

Tokenize a list of strings (with multithreading), where each corresponds to for example a sentence, returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on the list provided.

Parameters
  • text_list: list of strings to tokenize
Returns

Vec<TokensWithOffsets> with the token strings representation and offsets

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list_with_offsets(&text);

Multithreaded tokenization of a list of strings, returning tokens with offset information

Parameters
  • text_list: list of strings to tokenize
Returns

Vec<Vec<String>> with the token strings representation

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let texts = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list(&texts);

Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that in contrast with encode's optional second text, each text provided is encoded independently.

Parameters
  • text_list: sequence of input text (&str) to encode combined into a single encoding by using the build_input_with_special_tokens method.
  • max_len (usize): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy provided.
  • truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
  • stride (usize): amount of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns

Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided text

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text_1 = "Hello, world!";
let text_2 = "How is it going?";
let text_3 = "Very well thank you.";
let encoded_input = tokenizer.encode_list(
    &[text_1, text_2, text_3],
    5,
    &TruncationStrategy::LongestFirst,
    2,
);

Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list.

Parameters
  • text_list: sequence of input text (&str) to encode combined into a single encoding by using the build_input_with_special_tokens method.
  • max_len (usize): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy provided.
  • truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
  • stride (usize): amount of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns

Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided text pair

Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text_1 = "Hello, world!";
let text_2 = "This is a second sentence";
let text_3 = "Very well thank you.";
let text_4 = "This is another second sentence.";
let encoded_input = tokenizer.encode_pair_list(
    &[(text_1, text_2), (text_3, text_4)],
    5,
    &TruncationStrategy::LongestFirst,
    2,
);

Multithreaded conversion of a list of sequences of ids (integer) into strings, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids.

Arguments
  • token_ids_list: list of lists of tokenized input ids. Each inner list can be obtained using the encode or encode_plus methods.
  • skip_special_tokens: if set to True, will skip the special tokens in the output.
  • clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
Returns
  • Vec<String>: decoded sentences, one per input sequence of ids
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let token_ids_list = vec![vec![0, 1, 2, 42], vec![99, 3]];
let decoded_list = tokenizer.decode_list(
    &token_ids_list,
    skip_special_tokens,
    clean_up_tokenization_spaces,
);

Implementors