Trait rust_tokenizers::preprocessing::tokenizer::base_tokenizer::Tokenizer

pub trait Tokenizer<T: Vocab> {
    fn vocab(&self) -> &T;
    fn tokenize_to_tokens(&self, text: TokenRef) -> Vec<Token>;

    fn tokenize(&self, text: &str) -> Vec<String> { ... }
    fn tokenize_with_offsets(
        &self,
        text: &str
    ) -> (Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>) { ... }
    fn tokenize_list(&self, text_list: Vec<&str>) -> Vec<Vec<String>> { ... }
    fn tokenize_list_with_offsets(
        &self,
        text_list: Vec<&str>
    ) -> Vec<(Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)> { ... }
    fn convert_tokens_to_ids(&self, tokens: &Vec<String>) -> Vec<i64> { ... }
    fn encode(
        &self,
        text_1: &str,
        text_2: Option<&str>,
        max_len: usize,
        truncation_strategy: &TruncationStrategy,
        stride: usize
    ) -> TokenizedInput { ... }
    fn encode_list(
        &self,
        text_list: Vec<&str>,
        max_len: usize,
        truncation_strategy: &TruncationStrategy,
        stride: usize
    ) -> Vec<TokenizedInput> { ... }
    fn encode_pair_list(
        &self,
        text_list: Vec<(&str, &str)>,
        max_len: usize,
        truncation_strategy: &TruncationStrategy,
        stride: usize
    ) -> Vec<TokenizedInput> { ... }
    fn decode_to_vec(
        &self,
        token_ids: Vec<i64>,
        skip_special_tokens: bool
    ) -> Vec<String> { ... }
    fn decode(
        &self,
        token_ids: Vec<i64>,
        skip_special_tokens: bool,
        clean_up_tokenization_spaces: bool
    ) -> String { ... }
    fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String { ... }
    fn clean_up_tokenization(&self, input_string: String) -> String { ... }
    fn decode_list(
        &self,
        token_ids_list: Vec<Vec<i64>>,
        skip_special_tokens: bool,
        clean_up_tokenization_spaces: bool
    ) -> Vec<String> { ... }
    fn build_input_with_special_tokens(
        &self,
        tokens_1: Vec<i64>,
        tokens_2: Option<Vec<i64>>,
        offsets_1: Vec<Option<Offset>>,
        offsets_2: Option<Vec<Option<Offset>>>,
        original_offsets_1: Vec<Vec<OffsetSize>>,
        original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
        mask: Vec<Mask>,
        mask_2: Option<Vec<Mask>>
    ) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>) { ... }
}

Required methods

fn vocab(&self) -> &T

fn tokenize_to_tokens(&self, text: TokenRef) -> Vec<Token>

Tokenize a text, returning a vector of Token values (each carries offset information and other metadata).


Provided methods

fn tokenize(&self, text: &str) -> Vec<String>

Tokenize a string, returning a vector of tokens as strings. Use tokenize_with_offsets or tokenize_to_tokens if you also want offset information.
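
A minimal sketch of calling tokenize through the trait; the helper name and the Vocab import path are assumptions about this crate version:

    use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::Tokenizer;
    use rust_tokenizers::preprocessing::vocab::base_vocab::Vocab;

    // Illustrative helper: print the string tokens produced by any Tokenizer implementor.
    fn print_tokens<V: Vocab, T: Tokenizer<V>>(tokenizer: &T, text: &str) {
        let tokens: Vec<String> = tokenizer.tokenize(text);
        println!("{:?}", tokens);
    }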

fn tokenize_with_offsets(
    &self,
    text: &str
) -> (Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)

Tokenize a string, returning the tokens together with offset information, the original token positions, and token masks.
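
A sketch of consuming the returned tuple; variable names are illustrative, the Vocab import path is assumed, and the snippet assumes Offset implements Debug:

    use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::Tokenizer;
    use rust_tokenizers::preprocessing::vocab::base_vocab::Vocab;

    // Illustrative helper: print each token next to its (optional) character offset.
    fn show_spans<V: Vocab, T: Tokenizer<V>>(tokenizer: &T, text: &str) {
        let (tokens, offsets, _original_positions, _masks) = tokenizer.tokenize_with_offsets(text);
        for (token, offset) in tokens.iter().zip(offsets.iter()) {
            // `offset` is None for tokens that have no position in the input text.
            println!("{} -> {:?}", token, offset);
        }
    }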

fn tokenize_list(&self, text_list: Vec<&str>) -> Vec<Vec<String>>

Tokenize a vector of strings (each typically corresponding to a sentence, for example), returning a vector of vectors of strings. Use tokenize_list_with_offsets if you also want offset information.

fn tokenize_list_with_offsets(
    &self,
    text_list: Vec<&str>
) -> Vec<(Vec<String>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)>

Tokenize a vector of strings (each typically corresponding to a sentence, for example), returning for each input a tuple of the tokens and the associated offset information.
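
A sketch of batch tokenization with and without offsets; the helper name and import paths are illustrative assumptions:

    use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::Tokenizer;
    use rust_tokenizers::preprocessing::vocab::base_vocab::Vocab;

    // Illustrative helper: tokenize a batch of sentences two ways.
    fn tokenize_batch<V: Vocab, T: Tokenizer<V>>(tokenizer: &T, sentences: Vec<&str>) {
        // One Vec<String> of tokens per input sentence.
        let token_lists: Vec<Vec<String>> = tokenizer.tokenize_list(sentences.clone());
        // Same inputs, with offsets, original token positions and masks per sentence.
        let detailed = tokenizer.tokenize_list_with_offsets(sentences);
        assert_eq!(token_lists.len(), detailed.len());
    }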

fn convert_tokens_to_ids(&self, tokens: &Vec<String>) -> Vec<i64>
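
A sketch of chaining tokenize with convert_tokens_to_ids to look tokens up in the vocabulary; the helper name and the Vocab import path are assumptions:

    use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::Tokenizer;
    use rust_tokenizers::preprocessing::vocab::base_vocab::Vocab;

    // Illustrative helper: tokenize a text and convert the tokens to vocabulary ids.
    fn to_ids<V: Vocab, T: Tokenizer<V>>(tokenizer: &T, text: &str) -> Vec<i64> {
        let tokens = tokenizer.tokenize(text);
        tokenizer.convert_tokens_to_ids(&tokens)
    }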

fn encode(
    &self,
    text_1: &str,
    text_2: Option<&str>,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> TokenizedInput
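
A sketch of encoding a sentence pair into a TokenizedInput; the TruncationStrategy variant and the import paths are assumptions about this crate version:

    use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::{Tokenizer, TokenizedInput, TruncationStrategy};
    use rust_tokenizers::preprocessing::vocab::base_vocab::Vocab;

    // Illustrative helper: encode a pair of texts, truncating the longest sequence first,
    // keeping at most 128 ids and using no overlap (stride 0) for overflowing tokens.
    fn encode_pair<V: Vocab, T: Tokenizer<V>>(tokenizer: &T, text_1: &str, text_2: &str) -> TokenizedInput {
        tokenizer.encode(text_1, Some(text_2), 128, &TruncationStrategy::LongestFirst, 0)
    }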

fn encode_list(
    &self,
    text_list: Vec<&str>,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput>

fn encode_pair_list(
    &self,
    text_list: Vec<(&str, &str)>,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput>
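
A sketch of the batch variants, under the same assumptions about import paths and the TruncationStrategy variant:

    use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::{Tokenizer, TokenizedInput, TruncationStrategy};
    use rust_tokenizers::preprocessing::vocab::base_vocab::Vocab;

    // Illustrative helper: batch-encode single sentences and sentence pairs.
    fn encode_batches<V: Vocab, T: Tokenizer<V>>(
        tokenizer: &T,
        sentences: Vec<&str>,
        pairs: Vec<(&str, &str)>,
    ) -> (Vec<TokenizedInput>, Vec<TokenizedInput>) {
        let singles = tokenizer.encode_list(sentences, 128, &TruncationStrategy::LongestFirst, 0);
        let paired = tokenizer.encode_pair_list(pairs, 128, &TruncationStrategy::LongestFirst, 0);
        (singles, paired)
    }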

fn decode_to_vec(
    &self,
    token_ids: Vec<i64>,
    skip_special_tokens: bool
) -> Vec<String>

fn decode(
    &self,
    token_ids: Vec<i64>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> String

Converts a sequence of ids (integers) into a string, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces.

Args:

  • token_ids: list of tokenized input ids. Can be obtained using the encode, encode_list or encode_pair_list methods.
  • skip_special_tokens: if set to true, special tokens are removed from the decoded output.
  • clean_up_tokenization_spaces: if set to true, spacing artifacts introduced by tokenization are cleaned up.
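
A sketch of a decode call; the flag semantics follow the description above, and the helper name and Vocab import path are illustrative assumptions:

    use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::Tokenizer;
    use rust_tokenizers::preprocessing::vocab::base_vocab::Vocab;

    // Illustrative helper: turn ids back into a single cleaned-up string.
    fn decode_ids<V: Vocab, T: Tokenizer<V>>(tokenizer: &T, token_ids: Vec<i64>) -> String {
        // true, true: remove special tokens and clean up tokenization spaces.
        tokenizer.decode(token_ids, true, true)
    }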

fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String

fn clean_up_tokenization(&self, input_string: String) -> String

fn decode_list(
    &self,
    token_ids_list: Vec<Vec<i64>>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>

fn build_input_with_special_tokens(
    &self,
    tokens_1: Vec<i64>,
    tokens_2: Option<Vec<i64>>,
    offsets_1: Vec<Option<Offset>>,
    offsets_2: Option<Vec<Option<Offset>>>,
    original_offsets_1: Vec<Vec<OffsetSize>>,
    original_offsets_2: Option<Vec<Vec<OffsetSize>>>,
    mask: Vec<Mask>,
    mask_2: Option<Vec<Mask>>
) -> (Vec<i64>, Vec<i8>, Vec<i8>, Vec<Option<Offset>>, Vec<Vec<OffsetSize>>, Vec<Mask>)

Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and adding special tokens. For example, a RoBERTa sequence has the following format: single sequence: <s> X </s>; pair of sequences: <s> A </s></s> B </s>.

Returns a tuple of:

  • output token IDs
  • token segment IDs
  • special token mask
  • offsets (as a vector of Option<Offset>, because some added markers may not have associated offsets)
  • original token positions (as a vector of Vec<OffsetSize>)
  • token mask
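
A sketch of assembling a single-sequence input from the intermediate outputs documented above; variable and helper names are illustrative, and the Vocab import path is assumed:

    use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::Tokenizer;
    use rust_tokenizers::preprocessing::vocab::base_vocab::Vocab;

    // Illustrative helper: run the intermediate steps by hand for one sequence,
    // then add the model-specific special tokens.
    fn build_single_input<V: Vocab, T: Tokenizer<V>>(tokenizer: &T, text: &str) -> Vec<i64> {
        let (tokens, offsets, original_positions, mask) = tokenizer.tokenize_with_offsets(text);
        let ids = tokenizer.convert_tokens_to_ids(&tokens);
        let (token_ids, _segment_ids, _special_token_mask, _offsets, _positions, _mask) = tokenizer
            .build_input_with_special_tokens(ids, None, offsets, None, original_positions, None, mask, None);
        token_ids
    }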

Implementors

impl Tokenizer<BertVocab> for BertTokenizer

impl Tokenizer<Gpt2Vocab> for Gpt2Tokenizer

impl Tokenizer<OpenAiGptVocab> for CtrlTokenizer

impl Tokenizer<OpenAiGptVocab> for OpenAiGptTokenizer

impl Tokenizer<RobertaVocab> for RobertaTokenizer

impl<T: Vocab + Sync + Send> Tokenizer<T> for BaseTokenizer<T>
