pub trait MultiThreadedTokenizer<T: Vocab>
where
    Self: Sync + Send + Tokenizer<T>,
{
// Provided methods
fn vocab(&self) -> &T { ... }
fn tokenize_list_with_offsets<S>(
&self,
text_list: &[S]
) -> Vec<TokensWithOffsets>
where S: AsRef<str> + Sync { ... }
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
where S: AsRef<str> + Sync { ... }
fn encode_list<S>(
&self,
text_list: &[S],
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>
where S: AsRef<str> + Sync { ... }
fn encode_pair_list<S>(
&self,
text_list: &[(S, S)],
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>
where S: AsRef<str> + Sync { ... }
fn decode_list(
&self,
token_ids_list: &[Vec<i64>],
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> Vec<String> { ... }
}
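Any tokenizer that is Sync + Send and implements Tokenizer<T> gets these parallel list methods for free as provided methods. Below is a minimal usage sketch, not a definitive recipe: the vocabulary path is a placeholder (as in the method examples that follow), and fully qualified syntax is used because Tokenizer defines identically named list methods.
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", false, false).unwrap();
// Single-input methods come from the Tokenizer supertrait...
let single = tokenizer.tokenize("Hello, world!");
// ...while the list methods process their inputs in parallel. Fully
// qualified syntax avoids ambiguity with Tokenizer::tokenize_list.
let batch =
    MultiThreadedTokenizer::tokenize_list(&tokenizer, &["Hello, world!", "Second sentence"]);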
Provided Methods
fn tokenize_list_with_offsets<S>(&self, text_list: &[S]) -> Vec<TokensWithOffsets>
where
    S: AsRef<str> + Sync,
Tokenizes a list of strings with multithreading, where each string corresponds, for example, to a sentence, and returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on each element of the list provided.
Parameters
- text_list: list of strings to tokenize
Returns
Vec<TokensWithOffsets> with the token string representations and their offsets
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list_with_offsets(&text);
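Each returned TokensWithOffsets pairs the token strings with the character spans they came from. A hedged sketch of consuming the result, assuming the struct's public tokens and offsets fields (offsets are Option values, since inserted tokens may have no source span):
// Print every token together with its source offsets.
for tokens_with_offsets in &tokens {
    for (token, offset) in tokens_with_offsets
        .tokens
        .iter()
        .zip(tokens_with_offsets.offsets.iter())
    {
        println!("{}: {:?}", token, offset);
    }
}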
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
where
    S: AsRef<str> + Sync,
Multithreaded tokenization of a list of strings, returning the token strings for each input (without offset information)
Parameters
- text_list: list of strings to tokenize
Returns
Vec<Vec<String>> with the token string representations
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let texts = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list(&texts);
fn encode_list<S>(
    &self,
    text_list: &[S],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput>
where
    S: AsRef<str> + Sync,
Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with encode and its optional second text, each text provided is encoded independently.
Parameters
- text_list: sequence of input texts (&str) to encode, each combined into a single encoding by using the build_input_with_special_tokens method
- max_len (usize): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy provided
- truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
- stride (usize): number of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns
Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided text
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text_1 = "Hello, world!";
let text_2 = "How is it going?";
let text_3 = "Very well thank you.";
let encoded_input = tokenizer.encode_list(
&[text_1, text_2, text_3],
5,
&TruncationStrategy::LongestFirst,
2,
);
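With max_len set to 5 and a stride of 2, any input longer than five tokens is truncated and the removed ids are retained. A hedged sketch of inspecting the output, assuming the TokenizedInput fields token_ids, num_truncated_tokens and overflowing_tokens implied by the Returns description:
// Inspect each encoding: ids removed by truncation end up in
// overflowing_tokens rather than being discarded.
for encoding in &encoded_input {
    println!(
        "ids: {:?}, truncated: {}, overflowing: {:?}",
        encoding.token_ids, encoding.num_truncated_tokens, encoding.overflowing_tokens
    );
}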
fn encode_pair_list<S>(
    &self,
    text_list: &[(S, S)],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput>
where
    S: AsRef<str> + Sync,
Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list.
Parameters
- text_list: sequence of input text pairs ((S, S)) to encode, each pair combined into a single encoding by using the build_input_with_special_tokens method
- max_len (usize): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy provided
- truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
- stride (usize): number of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns
Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided pair
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text_1 = "Hello, world!";
let text_2 = "This is a second sentence";
let text_3 = "Very well thank you.";
let text_4 = "This is another second sentence.";
let encoded_input = tokenizer.encode_pair_list(
&[(text_1, text_2), (text_3, text_4)],
5,
&TruncationStrategy::LongestFirst,
2,
);
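For pair inputs the two texts share one encoding, and the segment ids record which text each token belongs to. A hedged sketch, assuming the TokenizedInput fields token_ids and segment_ids implied by the Returns description:
// segment_ids are 0 for tokens from the first text of a pair and 1
// for tokens from the second, as consumed by BERT-style models.
for encoding in &encoded_input {
    println!("ids: {:?}", encoding.token_ids);
    println!("segments: {:?}", encoding.segment_ids);
}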
fn decode_list(
    &self,
    token_ids_list: &[Vec<i64>],
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>
Multithreaded conversion of a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids.
Arguments
- token_ids_list: list of lists of tokenized input ids. Can be obtained using the encode or encode_plus methods.
- skip_special_tokens: if set to true, special tokens are removed from the output.
- clean_up_tokenization_spaces: if set to true, tokenization spaces in the output are cleaned up.
Returns
Vec<String>: the decoded sentences
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let token_ids_list = vec![vec![0, 1, 2, 42], vec![99, 3]];
let decoded_list = tokenizer.decode_list(
&token_ids_list,
skip_special_tokens,
clean_up_tokenization_spaces,
);
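Since decode_list is the inverse of the encoding methods above, ids produced by encode_list can be fed straight back. A hedged round-trip sketch, reusing the tokenizer above and assuming the token_ids field of TokenizedInput:
// Encode a batch, then decode the resulting ids back to text.
let encodings =
    tokenizer.encode_list(&["Hello, world!"], 128, &TruncationStrategy::LongestFirst, 0);
let ids: Vec<Vec<i64>> = encodings.into_iter().map(|e| e.token_ids).collect();
let round_trip = tokenizer.decode_list(&ids, true, true);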