Trait rust_tokenizers::tokenizer::Tokenizer
source · pub trait Tokenizer<T: Vocab> {
Show 19 methods
// Required methods
fn vocab(&self) -> &T;
fn vocab_mut(&mut self) -> &mut T;
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>;
// Provided methods
fn tokenize(&self, text: &str) -> Vec<String> { ... }
fn tokenize_with_offsets(&self, text: &str) -> TokensWithOffsets { ... }
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
where S: AsRef<str> { ... }
fn tokenize_list_with_offsets<S>(
&self,
text_list: &[S]
) -> Vec<TokensWithOffsets>
where S: AsRef<str> { ... }
fn convert_tokens_to_ids<S>(&self, tokens: &[S]) -> Vec<i64>
where S: AsRef<str> { ... }
fn encode(
&self,
text_1: &str,
text_2: Option<&str>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> TokenizedInput { ... }
fn encode_list<S>(
&self,
text_list: &[S],
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>
where S: AsRef<str> { ... }
fn encode_pair_list<S>(
&self,
text_list: &[(S, S)],
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>
where S: AsRef<str> { ... }
fn decode_to_vec(
&self,
token_ids: &[i64],
skip_special_tokens: bool
) -> Vec<String> { ... }
fn decode(
&self,
token_ids: &[i64],
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> String { ... }
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String { ... }
fn clean_up_tokenization(&self, input_string: String) -> String { ... }
fn decode_list(
&self,
token_ids_list: &[Vec<i64>],
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> Vec<String> { ... }
fn build_input_with_special_tokens(
&self,
tokens_ids_with_offsets_1: TokenIdsWithOffsets,
tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens { ... }
fn add_tokens(&mut self, tokens: &[&str]) { ... }
fn add_extra_ids(&mut self, num_extra_ids: i64) { ... }
}
Expand description
Required Methods§
sourcefn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
Tokenize a TokenRef, returning a sequence of tokens
Parameters
- text (
TokenRef
): TokenRef to tokenize (this is especially useful for nested tokenization, where a tokenizer is called on the output of a pre-tokenizer, such as BERT).
Returns
Vec<Token>
tokenization of the original TokenRef
Example
use itertools::Itertools;
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
use rust_tokenizers::{OffsetSize, TokenRef};
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text = "Hello, world!";
let offsets = (0..text.len() as OffsetSize).collect_vec();
let text = TokenRef::new(text, &offsets);
let tokens = tokenizer.tokenize_to_tokens(text);
Provided Methods§
sourcefn tokenize(&self, text: &str) -> Vec<String>
fn tokenize(&self, text: &str) -> Vec<String>
Tokenize a string, returns a vector of tokens as strings.
Use tokenize_with_offsets
or tokenize_to_tokens
to return offset information.
Parameters
- text : text (string-like) to tokenize
Returns
Vec<String>
containing the tokens string representation
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text = "Hello, world!";
let tokens = tokenizer.tokenize(text);
sourcefn tokenize_with_offsets(&self, text: &str) -> TokensWithOffsets
fn tokenize_with_offsets(&self, text: &str) -> TokensWithOffsets
Tokenize a string, returning tokens with offset information
Parameters
- text : text (string-like) to tokenize
Returns
TokensWithOffsets
with the tokens and their offset information
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text = "Hello, world!";
let tokens = tokenizer.tokenize_with_offsets(text);
sourcefn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>where
S: AsRef<str>,
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>where S: AsRef<str>,
Tokenize a list of strings, returning tokens with offset information
Parameters
- text_list: list of strings to tokenize
Returns
Vec<Vec<String>>
with the token strings representation
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let texts = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list(&texts);
sourcefn tokenize_list_with_offsets<S>(
&self,
text_list: &[S]
) -> Vec<TokensWithOffsets>where
S: AsRef<str>,
fn tokenize_list_with_offsets<S>( &self, text_list: &[S] ) -> Vec<TokensWithOffsets>where S: AsRef<str>,
Tokenize a list of strings, where each corresponds to for example a sentence, returns a
vector of TokensWithOffsets containing the tokens and their offset information. This calls
tokenize_with_offsets
on the list provided.
Parameters
- text_list: list of strings to tokenize
Returns
Vec<TokensWithOffsets>
with the token strings representation and offsets
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list_with_offsets(&text);
sourcefn convert_tokens_to_ids<S>(&self, tokens: &[S]) -> Vec<i64>where
S: AsRef<str>,
fn convert_tokens_to_ids<S>(&self, tokens: &[S]) -> Vec<i64>where S: AsRef<str>,
Convert a slice of string-like to a vector of token indices
Parameters
- tokens: list of token string-like to convert to ids
Returns
Vec<i64>
with the token indices
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let tokens = ["Hello", ",", "world", "!"];
let token_ids = tokenizer.convert_tokens_to_ids(&tokens);
sourcefn encode(
&self,
text_1: &str,
text_2: Option<&str>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> TokenizedInput
fn encode( &self, text_1: &str, text_2: Option<&str>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> TokenizedInput
Encode a string-like (tokenization followed by encoding)
Parameters
- text_1: input text (string-like) to encode
- text_2: optional additional input text (string-like) to encode. When provided, both texts are
combined into a single encoding by using the
build_input_with_special_tokens
method. - max_len (
usize
): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy
provided. - truncation_strategy (
&TruncationStrategy
): strategy to follow for the truncation, if required - stride (
usize
): amount of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns
TokenizedInput
containing the encoding output (token indices, token types, segment ids,
overflowing tokens and special token mask)
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text_1 = "Hello, world!";
let text_2 = "How is it going?";
let encoded_input = tokenizer.encode(
text_1,
Some(text_2),
5,
&TruncationStrategy::LongestFirst,
2,
);
sourcefn encode_list<S>(
&self,
text_list: &[S],
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>where
S: AsRef<str>,
fn encode_list<S>( &self, text_list: &[S], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput>where S: AsRef<str>,
Encode a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast
with encode
(which accepts an optional second text), each text provided here is encoded independently.
Parameters
- text_list: sequence of input texts (
&str
) to encode; each text is encoded independently (no pair combination is performed). - max_len (
usize
): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy
provided. - truncation_strategy (
&TruncationStrategy
): strategy to follow for the truncation, if required - stride (
usize
): amount of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns
Vec<TokenizedInput>
containing the encoding output (token indices, token types, segment ids,
overflowing tokens and special token mask) for each provided text
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text_1 = "Hello, world!";
let text_2 = "How is it going?";
let text_3 = "Very well thank you.";
let encoded_input = tokenizer.encode_list(
&[text_1, text_2, text_3],
5,
&TruncationStrategy::LongestFirst,
2,
);
sourcefn encode_pair_list<S>(
&self,
text_list: &[(S, S)],
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>where
S: AsRef<str>,
fn encode_pair_list<S>( &self, text_list: &[(S, S)], max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput>where S: AsRef<str>,
Encode a sequence of string-like text pairs (tokenization followed by encoding). This combines
the pair encoding of encode
with the list processing of encode_list
.
Parameters
- text_list: sequence of input text pairs (
&str
); each pair is combined into a single encoding by using the build_input_with_special_tokens
method. - max_len (
usize
): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy
provided. - truncation_strategy (
&TruncationStrategy
): strategy to follow for the truncation, if required - stride (
usize
): amount of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns
Vec<TokenizedInput>
containing the encoding output (token indices, token types, segment ids,
overflowing tokens and special token mask) for each provided text pair
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let text_1 = "Hello, world!";
let text_2 = "This is a second sentence";
let text_3 = "Very well thank you.";
let text_4 = "This is another second sentence.";
let encoded_input = tokenizer.encode_pair_list(
&[(text_1, text_2), (text_3, text_4)],
5,
&TruncationStrategy::LongestFirst,
2,
);
sourcefn decode_to_vec(
&self,
token_ids: &[i64],
skip_special_tokens: bool
) -> Vec<String>
fn decode_to_vec( &self, token_ids: &[i64], skip_special_tokens: bool ) -> Vec<String>
Decode a sequence of token indices to a sequence of Strings, optionally skipping special indices
Parameters
- token_ids (
Vec<i64>
): tokens to decode - skip_special_tokens (
bool
): flag indicating if special tokens should be excluded (skipped) from the output
Returns
Vec<String>
decoded token indices
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let tokens_ids = vec![0, 1, 2, 42];
let tokens = tokenizer.decode_to_vec(&tokens_ids, false);
sourcefn decode(
&self,
token_ids: &[i64],
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> String
fn decode( &self, token_ids: &[i64], skip_special_tokens: bool, clean_up_tokenization_spaces: bool ) -> String
Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces.
Arguments
- token_ids: list of tokenized input ids. Can be obtained using the
encode
orencode_plus
methods. - skip_special_tokens: if set to True, will remove special tokens from the output.
- clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
Returns
String
: decoded sentence
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let tokens = vec![0, 1, 2, 42];
let decoded = tokenizer.decode(&tokens, skip_special_tokens, clean_up_tokenization_spaces);
sourcefn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
Converts a sequence of strings into a single string. This will clean-up artifacts from tokenization
(for example sub ##word
) and generate a single output string
Arguments
- tokens: list of tokens to concatenate.
Returns
String
: concatenated sentence string
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let tokens = vec![
"Hello".to_string(),
",".to_string(),
"World".to_string(),
"!".to_string(),
];
let decoded = tokenizer.convert_tokens_to_string(tokens);
sourcefn clean_up_tokenization(&self, input_string: String) -> String
fn clean_up_tokenization(&self, input_string: String) -> String
Cleans-up tokenization artifacts (for example whitespace before punctuation)
Arguments
- input_string (
String
): input string to clean up
Returns
String
: clean-up string
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let input_string = "Hello . Do n't pay attention to the punctuation .".to_string();
let cleaned_string = tokenizer.clean_up_tokenization(input_string);
sourcefn decode_list(
&self,
token_ids_list: &[Vec<i64>],
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> Vec<String>
fn decode_list( &self, token_ids_list: &[Vec<i64>], skip_special_tokens: bool, clean_up_tokenization_spaces: bool ) -> Vec<String>
Converts a list of sequence of ids (integer) into a string, using the tokenizer and vocabulary
with options to remove special tokens and clean up tokenization spaces. This calls decode
for each provided sequence of ids
Arguments
- token_ids: list of list of tokenized input ids. Can be obtained using the
encode
orencode_plus
methods. - skip_special_tokens: if set to True, will remove special tokens from the output.
- clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
Returns
Vec<String>
: decoded sentences, one per input sequence of ids
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let token_ids_list = vec![vec![0, 1, 2, 42], vec![99, 3]];
let decoded_list = tokenizer.decode_list(
&token_ids_list,
skip_special_tokens,
clean_up_tokenization_spaces,
);
sourcefn build_input_with_special_tokens(
&self,
tokens_ids_with_offsets_1: TokenIdsWithOffsets,
tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
fn build_input_with_special_tokens( &self, tokens_ids_with_offsets_1: TokenIdsWithOffsets, tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets> ) -> TokenIdsWithSpecialTokens
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens.
For example, a RoBERTa sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Parameters
- tokens_ids_with_offsets_1 (
TokenIdsWithOffsets
): first sequence - tokens_ids_with_offsets_2 (
TokenIdsWithOffsets
): (optional) second sequence
Returns
TokenIdsWithSpecialTokens
containing a concatenation of both sequences with added special tokens
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;
use rust_tokenizers::TokenIdsWithOffsets;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let first_sequence = "Hello, world";
let second_sequence = "This is the second sentence";
let first_tokens = tokenizer.tokenize_with_offsets(first_sequence);
let first_ids = tokenizer.convert_tokens_to_ids(&first_tokens.tokens);
let first_input = TokenIdsWithOffsets {
ids: first_ids,
offsets: first_tokens.offsets,
reference_offsets: first_tokens.reference_offsets,
masks: first_tokens.masks,
};
let second_tokens = tokenizer.tokenize_with_offsets(second_sequence);
let second_ids = tokenizer.convert_tokens_to_ids(&second_tokens.tokens);
let second_input = TokenIdsWithOffsets {
ids: second_ids,
offsets: second_tokens.offsets,
reference_offsets: second_tokens.reference_offsets,
masks: second_tokens.masks,
};
let combined_with_special_tokens =
tokenizer.build_input_with_special_tokens(first_input, Some(second_input));
sourcefn add_tokens(&mut self, tokens: &[&str])
fn add_tokens(&mut self, tokens: &[&str])
Add arbitrary tokens to the vocabulary.
These tokens are added to the special token map and are ignored by the chosen tokenization algorithm (e.g. they will not be split into sub-tokens).
Parameters
- tokens (
&[&str]
): list of tokens to add to the vocabulary
sourcefn add_extra_ids(&mut self, num_extra_ids: i64)
fn add_extra_ids(&mut self, num_extra_ids: i64)
Add a given number of extra ids (sentinel tokens such as `<extra_id_0>`) to the vocabulary.
These tokens are added to the special token map and are ignored by the chosen tokenization algorithm (e.g. they will not be split into sub-tokens).
Parameters
- num_extra_ids (
i64
): number of tokens to append