[−][src]Trait rust_tokenizers::preprocessing::tokenizer::base_tokenizer::Tokenizer

pub trait Tokenizer<T: Vocab> {
    fn vocab(&self) -> &T;
    fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>;

    fn tokenize(&self, text: &str) -> Vec<String> { ... }
    fn tokenize_with_offsets(
        &self, 
        text: &str
    ) -> (Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>) { ... }
    fn tokenize_list(&self, text_list: Vec<&str>) -> Vec<Vec<String>> { ... }
    fn tokenize_list_with_offsets(
        &self, 
        text_list: Vec<&str>
    ) -> Vec<(Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)> { ... }
    fn convert_tokens_to_ids(&self, tokens: &Vec<String>) -> Vec<i64> { ... }
    fn encode(
        &self, 
        text_1: &str, 
        text_2: Option<&str>, 
        max_len: usize, 
        truncation_strategy: &TruncationStrategy, 
        stride: usize
    ) -> TokenizedInput { ... }
    fn encode_list(
        &self, 
        text_list: Vec<&str>, 
        max_len: usize, 
        truncation_strategy: &TruncationStrategy, 
        stride: usize
    ) -> Vec<TokenizedInput> { ... }
    fn encode_pair_list(
        &self, 
        text_list: Vec<(&str, &str)>, 
        max_len: usize, 
        truncation_strategy: &TruncationStrategy, 
        stride: usize
    ) -> Vec<TokenizedInput> { ... }
    fn decode_to_vec(
        &self, 
        token_ids: Vec<i64>, 
        skip_special_tokens: bool
    ) -> Vec<String> { ... }
    fn decode(
        &self, 
        token_ids: Vec<i64>, 
        skip_special_tokens: bool, 
        clean_up_tokenization_spaces: bool
    ) -> String { ... }
    fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String { ... }
    fn clean_up_tokenization(&self, input_string: String) -> String { ... }
    fn decode_list(
        &self, 
        token_ids_list: Vec<Vec<i64>>, 
        skip_special_tokens: bool, 
        clean_up_tokenization_spaces: bool
    ) -> Vec<String> { ... }
    fn build_input_with_special_tokens(
        &self, 
        tokens_1: Vec<i64>, 
        tokens_2: Option<Vec<i64>>, 
        offsets_1: Vec<Option<Offset>>, 
        offsets_2: Option<Vec<Option<Offset>>>, 
        original_offsets_1: Vec<Vec<OffsetSize>>, 
        original_offsets_2: Option<Vec<Vec<OffsetSize>>>, 
        mask: Vec<Mask>, 
        mask_2: Option<Vec<Mask>>
    ) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>) { ... }
}

Required methods

`fn vocab(&self) -> &T`

`fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>`

Tokenize a text, returns a vector of tokens (contains offset information and more)

Loading content...

Provided methods

`fn tokenize(&self, text: &str) -> Vec<String>`

Tokenize a string, returns a vector of tokens as strings. Use tokenize_with_offsets or tokenize_to_tokens if you also want offset information.

`fn tokenize_with_offsets( &self, text: &str ) -> (Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)`

Tokenize a string, return offset information

`fn tokenize_list(&self, text_list: Vec<&str>) -> Vec<Vec<String>>`

Tokenize a vector of strings, where each corresponds to for example a sentence, returns a vector of vectors of strings. Use tokenize_list_with_offsets if you also want offset information.

`fn tokenize_list_with_offsets( &self, text_list: Vec<&str> ) -> Vec<(Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)>`

Tokenize a vector of strings, where each corresponds to for example a sentence, returns a vector of pairs consists of a vector of tokens and a list of offset information.

`fn convert_tokens_to_ids(&self, tokens: &Vec<String>) -> Vec<i64>`

`fn encode( &self, text_1: &str, text_2: Option<&str>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> TokenizedInput`

`fn encode_list( &self, text_list: Vec<&str>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput>`

`fn encode_pair_list( &self, text_list: Vec<(&str, &str)>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput>`

`fn decode_to_vec( &self, token_ids: Vec<i64>, skip_special_tokens: bool ) -> Vec<String>`

`fn decode( &self, token_ids: Vec<i64>, skip_special_tokens: bool, clean_up_tokenization_spaces: bool ) -> String`

Converts a sequence of ids (integer) into astring, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. Args:

token_ids: list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
skip_special_tokens: if set to True, will replace special tokens.
clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.

`fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String`

`fn clean_up_tokenization(&self, input_string: String) -> String`

`fn decode_list( &self, token_ids_list: Vec<Vec<i64>>, skip_special_tokens: bool, clean_up_tokenization_spaces: bool ) -> Vec<String>`

`fn build_input_with_special_tokens( &self, tokens_1: Vec<i64>, tokens_2: Option<Vec<i64>>, offsets_1: Vec<Option<Offset>>, offsets_2: Option<Vec<Option<Offset>>>, original_offsets_1: Vec<Vec<OffsetSize>>, original_offsets_2: Option<Vec<Vec<OffsetSize>>>, mask: Vec<Mask>, mask_2: Option<Vec<Mask>> ) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)`

Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A RoBERTa sequence has the following format: single sequence: X pair of sequences: A B

Returns a tuple of:

output token IDs
token segment IDs
special token mask
offsets (as a vector of Option<Offset> because some added markers may not have associated offsets
token mask

Loading content...