Trait rust_tokenizers::preprocessing::tokenizer::base_tokenizer::Tokenizer
Required methods
fn vocab(&self) -> &T
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
Tokenize a text, returns a vector of tokens (contains offset information and more)
Provided methods
fn tokenize(&self, text: &str) -> Vec<String>
Tokenize a string, returns a vector of tokens as strings.
Use tokenize_with_offsets
or tokenize_to_tokens
if you also want offset information.
fn tokenize_with_offsets(
&self,
text: &str
) -> (Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
&self,
text: &str
) -> (Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
Tokenize a string, return offset information
fn tokenize_list(&self, text_list: Vec<&str>) -> Vec<Vec<String>>
Tokenize a vector of strings, where each string corresponds to, for example, a sentence; returns a vector of vectors of strings.
Use tokenize_list_with_offsets
if you also want offset information.
fn tokenize_list_with_offsets(
&self,
text_list: Vec<&str>
) -> Vec<(Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)>
&self,
text_list: Vec<&str>
) -> Vec<(Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)>
Tokenize a vector of strings, where each string corresponds to, for example, a sentence; returns a vector of pairs, each consisting of a vector of tokens and a list of offset information.
fn convert_tokens_to_ids(&self, tokens: &Vec<String>) -> Vec<i64>
fn encode(
&self,
text_1: &str,
text_2: Option<&str>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> TokenizedInput
&self,
text_1: &str,
text_2: Option<&str>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> TokenizedInput
fn encode_list(
&self,
text_list: Vec<&str>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>
&self,
text_list: Vec<&str>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>
fn encode_pair_list(
&self,
text_list: Vec<(&str, &str)>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>
&self,
text_list: Vec<(&str, &str)>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput>
fn decode_to_vec(
&self,
token_ids: Vec<i64>,
skip_special_tokens: bool
) -> Vec<String>
&self,
token_ids: Vec<i64>,
skip_special_tokens: bool
) -> Vec<String>
fn decode(
&self,
token_ids: Vec<i64>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> String
&self,
token_ids: Vec<i64>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> String
Converts a sequence of ids (integers) into a string, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. Args:
- token_ids: list of tokenized input ids. Can be obtained using the
encode
or encode_plus
methods. - skip_special_tokens: if set to True, will remove special tokens from the output.
- clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
fn clean_up_tokenization(&self, input_string: String) -> String
fn decode_list(
&self,
token_ids_list: Vec<Vec<i64>>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> Vec<String>
&self,
token_ids_list: Vec<Vec<i64>>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> Vec<String>
fn build_input_with_special_tokens(
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A RoBERTa sequence has the following format (special-token markers were stripped during extraction; standard RoBERTa format shown):
single sequence: &lt;s&gt; X &lt;/s&gt;
pair of sequences: &lt;s&gt; A &lt;/s&gt;&lt;/s&gt; B &lt;/s&gt;
Returns a tuple of:
- output token IDs
- token segment IDs
- special token mask
- offsets (as a vector of
Option&lt;Offset&gt;
because some added markers may not have associated offsets)
- token mask
Implementors
impl Tokenizer<AlbertVocab> for AlbertTokenizer
[src]
fn vocab(&self) -> &AlbertVocab
[src]
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
fn build_input_with_special_tokens(
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask_1: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
[src]
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask_1: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
impl Tokenizer<BertVocab> for BertTokenizer
[src]
fn vocab(&self) -> &BertVocab
[src]
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
fn build_input_with_special_tokens(
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask_1: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
[src]
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask_1: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
impl Tokenizer<Gpt2Vocab> for Gpt2Tokenizer
[src]
fn vocab(&self) -> &Gpt2Vocab
[src]
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
impl Tokenizer<MarianVocab> for MarianTokenizer
[src]
fn vocab(&self) -> &MarianVocab
[src]
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
fn build_input_with_special_tokens(
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask_1: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
[src]
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask_1: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
impl Tokenizer<OpenAiGptVocab> for CtrlTokenizer
[src]
fn vocab(&self) -> &OpenAiGptVocab
[src]
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
impl Tokenizer<OpenAiGptVocab> for OpenAiGptTokenizer
[src]
fn vocab(&self) -> &OpenAiGptVocab
[src]
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
impl Tokenizer<RobertaVocab> for RobertaTokenizer
[src]
fn vocab(&self) -> &RobertaVocab
[src]
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
fn build_input_with_special_tokens(
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask_1: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
[src]
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask_1: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
impl Tokenizer<SentencePieceVocab> for SentencePieceTokenizer
[src]
fn vocab(&self) -> &SentencePieceVocab
[src]
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
impl Tokenizer<T5Vocab> for T5Tokenizer
[src]
fn vocab(&self) -> &T5Vocab
[src]
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
impl Tokenizer<XLMRobertaVocab> for XLMRobertaTokenizer
[src]
fn vocab(&self) -> &XLMRobertaVocab
[src]
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
[src]
fn build_input_with_special_tokens(
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask_1: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)
[src]
&self,
tokens_1: Vec<i64>,
tokens_2: Option<Vec<i64>>,
offsets_1: Vec<Option<Offset>>,
offsets_2: Option<Vec<Option<Offset>>>,
original_offsets_1: Vec<Vec<OffsetSize>>,
original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
mask_1: Vec<Mask>,
mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)