Trait rust_tokenizers::tokenizer::Tokenizer
Required methods
fn vocab(&self) -> &T
Returns a reference to the tokenizer vocabulary
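A minimal sketch of using the returned vocabulary directly; it assumes a token_to_id lookup on the Vocab trait and a placeholder vocabulary path:
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::{BaseVocab, Vocab};

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

// Look up a token id directly in the underlying vocabulary
// (token_to_id is assumed to be provided by the Vocab trait).
let id = tokenizer.vocab().token_to_id("hello");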
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
Tokenize a TokenRef, returning a sequence of tokens
Parameters
- text (TokenRef): TokenRef to tokenize (this is especially useful for nested tokenization, where a tokenizer is called on the output of a pre-tokenizer, such as BERT).
Returns
Vec<Token>: the tokenization of the original TokenRef
Example
use itertools::Itertools;
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
use rust_tokenizers::{OffsetSize, TokenRef};

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text = "Hello, world!";
let offsets = (0..text.len() as OffsetSize).collect_vec();
let text = TokenRef::new(text, &offsets);
let tokens = tokenizer.tokenize_to_tokens(text);
Provided methods
fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>
Tokenize a string, returning a vector of tokens as strings. Use tokenize_with_offsets or tokenize_to_tokens to also obtain offset information.
Parameters
- text: text (string-like) to tokenize
Returns
Vec<String> containing the string representation of the tokens
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text = "Hello, world!";
let tokens = tokenizer.tokenize(text);
fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets
Tokenize a string, returning tokens with offset information
Parameters
- text: text (string-like) to tokenize
Returns
TokensWithOffsets with the tokens and their offset information
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text = "Hello, world!";
let tokens = tokenizer.tokenize_with_offsets(text);
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
S: AsRef<[ST]>,
ST: AsRef<str>,
Tokenize a list of strings, returning for each input a vector of tokens as strings (use tokenize_list_with_offsets to also obtain offset information).
Parameters
- text_list: list of strings to tokenize
Returns
Vec<Vec<String>> with the string representation of the tokens
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let texts = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list(&texts);
fn tokenize_list_with_offsets<S, ST>(
&self,
text_list: S
) -> Vec<TokensWithOffsets> where
S: AsRef<[ST]>,
ST: AsRef<str>,
Tokenize a list of strings, where each element corresponds for example to a sentence, returning a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on each element of the list provided.
Parameters
- text_list: list of strings to tokenize
Returns
Vec<TokensWithOffsets> with the string representation of the tokens and their offsets
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text = ["Hello, world!", "Second sentence"];
let tokens = tokenizer.tokenize_list_with_offsets(&text);
fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where
S: AsRef<[ST]>,
ST: AsRef<str>,
Convert a slice of string-like values to a vector of token indices
Parameters
- tokens: list of string-like tokens to convert to ids
Returns
Vec<i64> with the token indices
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let tokens = ["Hello", ",", "world", "!"];
let token_ids = tokenizer.convert_tokens_to_ids(&tokens);
fn encode<S: AsRef<str>>(
&self,
text_1: S,
text_2: Option<S>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> TokenizedInput
Encode a string-like input (tokenization followed by encoding)
Parameters
- text_1: input text (string-like) to encode
- text_2: optional additional input text (string-like) to encode. When provided, both texts are combined into a single encoding using the build_input_with_special_tokens method.
- max_len (usize): maximum combined sequence length. If the combined encoding would exceed this max_len, the encoding is truncated following the TruncationStrategy provided.
- truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
- stride (usize): number of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns
TokenizedInput containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask)
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text_1 = "Hello, world!";
let text_2 = "How is it going?";
let encoded_input = tokenizer.encode(
    text_1,
    Some(text_2),
    5,
    &TruncationStrategy::LongestFirst,
    2,
);
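Continuing from the example above: with a max_len of 5 and a stride of 2, most of the combined input ends up in the overflow. A minimal sketch of inspecting the result; the field names token_ids, overflowing_tokens and num_truncated_tokens on TokenizedInput are assumptions, not confirmed from the struct's documentation:
// Field names on TokenizedInput are assumed here.
println!("kept token ids: {:?}", encoded_input.token_ids);
println!("overflowing ids: {:?}", encoded_input.overflowing_tokens);
println!("number of truncated tokens: {}", encoded_input.num_truncated_tokens);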
fn encode_list<S, ST>(
&self,
text_list: S,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput> where
S: AsRef<[ST]>,
ST: AsRef<str>,
Encode a sequence of string-like texts (tokenization followed by encoding). Note that in contrast with the optional second text of encode, each text provided here is encoded independently.
Parameters
- text_list: sequence of input texts (string-like) to encode
- max_len (usize): maximum sequence length. If an encoding would exceed this max_len, it is truncated following the TruncationStrategy provided.
- truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
- stride (usize): number of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns
Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided text
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text_1 = "Hello, world!";
let text_2 = "How is it going?";
let text_3 = "Very well thank you.";
let encoded_input = tokenizer.encode_list(
    [text_1, text_2, text_3],
    5,
    &TruncationStrategy::LongestFirst,
    2,
);
fn encode_pair_list<S, ST>(
&self,
text_list: S,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput> where
S: AsRef<[(ST, ST)]>,
ST: AsRef<str>,
Encode a sequence of string-like text pairs (tokenization followed by encoding). This combines the pair encoding of encode with the list processing of encode_list.
Parameters
- text_list: sequence of input text pairs (string-like) to encode; each pair is combined into a single encoding using the build_input_with_special_tokens method.
- max_len (usize): maximum combined sequence length. If a combined encoding would exceed this max_len, it is truncated following the TruncationStrategy provided.
- truncation_strategy (&TruncationStrategy): strategy to follow for the truncation, if required
- stride (usize): number of tokens to shift the input by if truncation is required (allowing for the generation of overlapping sequences with overflowing tokens)
Returns
Vec<TokenizedInput> containing the encoding output (token indices, token types, segment ids, overflowing tokens and special token mask) for each provided pair
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let text_1 = "Hello, world!";
let text_2 = "This is a second sentence";
let text_3 = "Very well thank you.";
let text_4 = "This is another second sentence.";
let encoded_input = tokenizer.encode_pair_list(
    [(text_1, text_2), (text_3, text_4)],
    5,
    &TruncationStrategy::LongestFirst,
    2,
);
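The truncation behaviour of all encode methods is driven by the TruncationStrategy argument. A minimal sketch of selecting a different strategy; the variant names (LongestFirst, OnlyFirst, OnlySecond, DoNotTruncate) are assumed to mirror the Python reference implementation:
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

// OnlyFirst truncates only the first sequence of each pair when the
// combined length exceeds max_len (variant name assumed).
let encoded_input = tokenizer.encode_pair_list(
    [("Hello, world!", "How is it going?")],
    5,
    &TruncationStrategy::OnlyFirst,
    2,
);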
fn decode_to_vec(
&self,
token_ids: Vec<i64>,
skip_special_tokens: bool
) -> Vec<String>
Decode a sequence of token indices into a sequence of Strings, optionally skipping special indices
Parameters
- token_ids (Vec<i64>): tokens to decode
- skip_special_tokens (bool): flag indicating whether special tokens should be skipped in the output
Returns
Vec<String> with the decoded token indices
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let tokens_ids = vec![0, 1, 2, 42];
let tokens = tokenizer.decode_to_vec(tokens_ids, false);
fn decode(
&self,
token_ids: Vec<i64>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> String
Converts a sequence of ids (integers) into a string, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces.
Arguments
- token_ids: list of tokenized input ids. Can be obtained using the encode method.
- skip_special_tokens: if set to true, special tokens are removed from the output.
- clean_up_tokenization_spaces: if set to true, tokenization spaces are cleaned up.
Returns
String: the decoded sentence
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let tokens = vec![0, 1, 2, 42];
let decoded = tokenizer.decode(tokens, skip_special_tokens, clean_up_tokenization_spaces);
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
Converts a sequence of strings into a single string. This cleans up artifacts from tokenization (for example sub-word continuation markers such as ##word) and generates a single output string.
Arguments
- tokens: list of tokens to concatenate
Returns
String: the concatenated sentence string
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let tokens = vec![
    "Hello".to_string(),
    ",".to_string(),
    "World".to_string(),
    "!".to_string(),
];
let decoded = tokenizer.convert_tokens_to_string(tokens);
fn clean_up_tokenization(&self, input_string: String) -> String
Cleans up tokenization artifacts (for example whitespace before punctuation)
Arguments
- input_string (String): input string to clean up
Returns
String: the cleaned-up string
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let input_string = "Hello . Do n't pay attention to the punctuation .".to_string();
let cleaned_string = tokenizer.clean_up_tokenization(input_string);
fn decode_list(
&self,
token_ids_list: Vec<Vec<i64>>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> Vec<String>
Converts a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids.
Arguments
- token_ids_list: list of lists of tokenized input ids. Can be obtained using the encode method.
- skip_special_tokens: if set to true, special tokens are removed from the output.
- clean_up_tokenization_spaces: if set to true, tokenization spaces are cleaned up.
Returns
Vec<String>: the decoded sentences
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let token_ids_list = vec![vec![0, 1, 2, 42], vec![99, 3]];
let decoded_list = tokenizer.decode_list(
    token_ids_list,
    skip_special_tokens,
    clean_up_tokenization_spaces,
);
fn build_input_with_special_tokens(
&self,
tokens_ids_with_offsets_1: TokenIdsWithOffsets,
tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and adding special tokens.
For example, a RoBERTa sequence has the following format:
- single sequence: <s> X </s>
- pair of sequences: <s> A </s></s> B </s>
Parameters
- tokens_ids_with_offsets_1 (TokenIdsWithOffsets): first sequence
- tokens_ids_with_offsets_2 (TokenIdsWithOffsets): (optional) second sequence
Returns
TokenIdsWithSpecialTokens containing a concatenation of both sequences with added special tokens
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
use rust_tokenizers::TokenIdsWithOffsets;

let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

let first_sequence = "Hello, world";
let second_sequence = "This is the second sentence";

let first_tokens = tokenizer.tokenize_with_offsets(first_sequence);
let first_ids = tokenizer.convert_tokens_to_ids(first_tokens.tokens);
let first_input = TokenIdsWithOffsets {
    ids: first_ids,
    offsets: first_tokens.offsets,
    reference_offsets: first_tokens.reference_offsets,
    masks: first_tokens.masks,
};

let second_tokens = tokenizer.tokenize_with_offsets(second_sequence);
let second_ids = tokenizer.convert_tokens_to_ids(second_tokens.tokens);
let second_input = TokenIdsWithOffsets {
    ids: second_ids,
    offsets: second_tokens.offsets,
    reference_offsets: second_tokens.reference_offsets,
    masks: second_tokens.masks,
};

let combined_with_special_tokens =
    tokenizer.build_input_with_special_tokens(first_input, Some(second_input));
Implementors
impl Tokenizer<AlbertVocab> for AlbertTokenizer
fn vocab(&self) -> &AlbertVocab
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
impl Tokenizer<BertVocab> for BertTokenizer
fn vocab(&self) -> &BertVocab
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
impl Tokenizer<Gpt2Vocab> for Gpt2Tokenizer
fn vocab(&self) -> &Gpt2Vocab
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
impl Tokenizer<MarianVocab> for MarianTokenizer
fn vocab(&self) -> &MarianVocab
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
impl Tokenizer<OpenAiGptVocab> for CtrlTokenizer
fn vocab(&self) -> &OpenAiGptVocab
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
impl Tokenizer<OpenAiGptVocab> for OpenAiGptTokenizer
fn vocab(&self) -> &OpenAiGptVocab
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
impl Tokenizer<ReformerVocab> for ReformerTokenizer
fn vocab(&self) -> &ReformerVocab
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
impl Tokenizer<RobertaVocab> for RobertaTokenizer
fn vocab(&self) -> &RobertaVocab
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
impl Tokenizer<SentencePieceVocab> for SentencePieceTokenizer
fn vocab(&self) -> &SentencePieceVocab
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
impl Tokenizer<T5Vocab> for T5Tokenizer
fn vocab(&self) -> &T5Vocab
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
impl Tokenizer<XLMRobertaVocab> for XLMRobertaTokenizer
fn vocab(&self) -> &XLMRobertaVocab
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
impl Tokenizer<XLNetVocab> for XLNetTokenizer
fn vocab(&self) -> &XLNetVocab
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens