Struct rust_tokenizers::tokenizer::MBart50Tokenizer[−][src]

pub struct MBart50Tokenizer { /* fields omitted */ }

Expand description

MBart50 tokenizer

MBart50 tokenizer performing:

Splitting on special tokens
text cleaning
NFKC normalization
(optional) lower casing
SentencePiece decomposition

Implementations

impl MBart50Tokenizer[src]

pub fn from_file(
    path: &str, 
    lower_case: bool
) -> Result<MBart50Tokenizer, TokenizerError>

[src]

Create a new instance of a MBart50Tokenizer. Expects a json vocab file and a SentencePiece protobuf file as an input.

Parameters

path (&str): path to the SentencePiece model file
lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization

Example

use rust_tokenizers::tokenizer::{Tokenizer, MBart50Tokenizer};
let lower_case = false;
let tokenizer = MBart50Tokenizer::from_file("path/to/vocab/file", lower_case).unwrap();

pub fn from_existing_vocab_and_model(
    vocab: MBart50Vocab, 
    model: SentencePieceModel, 
    lower_case: bool
) -> MBart50Tokenizer

[src]

Create a new instance of a MBart50Tokenizer from an existing vocabulary and model

Parameters

vocab (MBart50Vocab): vocabulary
model (SentencePieceModel): SentencePiece model
lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization

Example

use rust_tokenizers::tokenizer::{Tokenizer, MBart50Tokenizer};
use rust_tokenizers::vocab::{SentencePieceModel, Vocab, MBart50Vocab};
let lower_case = false;
let vocab = MBart50Vocab::from_file("path/to/vocab/file").unwrap();
let model = SentencePieceModel::from_file("path/to/model/file").unwrap();

let tokenizer = MBart50Tokenizer::from_existing_vocab_and_model(vocab, model, lower_case);

Trait Implementations

impl MultiThreadedTokenizer<MBart50Vocab> for MBart50Tokenizer[src]

fn vocab(&self) -> &T[src]

returns a reference to the tokenizer vocabulary

fn tokenize_list_with_offsets<S, ST>(
    &self, 
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,

[src]

Tokenize a list of strings (with multithreading), where each string corresponds to, for example, a sentence. Returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on the list provided. Read more

fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,

[src]

Multithreaded tokenization of a list of strings, returning a vector of tokens (as strings) for each input string. Use tokenize_list_with_offsets to return offset information. Read more

fn encode_list<S, ST>(
    &self, 
    text_list: S, 
    max_len: usize, 
    truncation_strategy: &TruncationStrategy, 
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,

[src]

Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with the optional second text of encode, each text provided is encoded independently. Read more

fn encode_pair_list<S, ST>(
    &self, 
    text_list: S, 
    max_len: usize, 
    truncation_strategy: &TruncationStrategy, 
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str> + Sync,

[src]

Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list. Read more

fn decode_list(
    &self, 
    token_ids_list: Vec<Vec<i64>>, 
    skip_special_tokens: bool, 
    clean_up_tokenization_spaces: bool
) -> Vec<String>

[src]

Multithreaded conversion of a list of sequences of ids (integers) into a list of strings, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids. Read more

impl Tokenizer<MBart50Vocab> for MBart50Tokenizer[src]

fn vocab(&self) -> &MBart50Vocab[src]

returns a reference to the tokenizer vocabulary

fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>[src]

Tokenize a TokenRef, returning a sequence of tokens Read more

fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String[src]

Converts a sequence of strings into a single string. This will clean-up artifacts from tokenization (for example sub ##word) and generate a single output string Read more

fn build_input_with_special_tokens(
    &self, 
    tokens_ids_with_offsets_1: TokenIdsWithOffsets, 
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens

[src]

Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and adding special tokens. Read more

fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>[src]

Tokenize a string, returns a vector of tokens as strings. Use tokenize_with_offsets or tokenize_to_tokens to return offset information. Read more

fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets[src]

Tokenize a string, returning tokens with offset information Read more

fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,

[src]

Tokenize a list of strings, returning a vector of tokens (as strings) for each input string. Use tokenize_list_with_offsets to return offset information. Read more

fn tokenize_list_with_offsets<S, ST>(
    &self, 
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,

[src]

Tokenize a list of strings, where each string corresponds to, for example, a sentence. Returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on the list provided. Read more

fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,

[src]

Convert a slice of string-like values to a vector of token indices. Read more

fn encode<S: AsRef<str>>(
    &self, 
    text_1: S, 
    text_2: Option<S>, 
    max_len: usize, 
    truncation_strategy: &TruncationStrategy, 
    stride: usize
) -> TokenizedInput

[src]

Encode a string-like (tokenization followed by encoding) Read more

fn encode_list<S, ST>(
    &self, 
    text_list: S, 
    max_len: usize, 
    truncation_strategy: &TruncationStrategy, 
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,

[src]

Encode a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with the optional second text of encode, each text provided is encoded independently. Read more

fn encode_pair_list<S, ST>(
    &self, 
    text_list: S, 
    max_len: usize, 
    truncation_strategy: &TruncationStrategy, 
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str>,

[src]

Encode a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list. Read more

fn decode_to_vec(
    &self, 
    token_ids: Vec<i64>, 
    skip_special_tokens: bool
) -> Vec<String>

[src]

Decode a sequence of token indices to a sequence of Strings, optionally skipping special indices Read more

fn decode(
    &self, 
    token_ids: Vec<i64>, 
    skip_special_tokens: bool, 
    clean_up_tokenization_spaces: bool
) -> String

[src]

Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. Read more

fn clean_up_tokenization(&self, input_string: String) -> String[src]

Cleans-up tokenization artifacts (for example whitespace before punctuation) Read more

fn decode_list(
    &self, 
    token_ids_list: Vec<Vec<i64>>, 
    skip_special_tokens: bool, 
    clean_up_tokenization_spaces: bool
) -> Vec<String>

[src]

Converts a list of sequences of ids (integers) into a list of strings, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids. Read more

Auto Trait Implementations

impl RefUnwindSafe for MBart50Tokenizer

impl Send for MBart50Tokenizer

impl Sync for MBart50Tokenizer

impl Unpin for MBart50Tokenizer

impl UnwindSafe for MBart50Tokenizer

Blanket Implementations

impl<T> Any for T where
    T: 'static + ?Sized,

[src]

pub fn type_id(&self) -> TypeId[src]

Gets the TypeId of self. Read more

impl<T> Borrow<T> for T where
    T: ?Sized,

[src]

pub fn borrow(&self) -> &T[src]

Immutably borrows from an owned value. Read more

impl<T> BorrowMut<T> for T where
    T: ?Sized,

[src]

pub fn borrow_mut(&mut self) -> &mut T[src]

Mutably borrows from an owned value. Read more

impl<T> From<T> for T[src]

pub fn from(t: T) -> T[src]

Performs the conversion.

impl<T, U> Into<U> for T where
    U: From<T>,

[src]

pub fn into(self) -> U[src]

Performs the conversion.

impl<T> Pointable for T

pub const ALIGN: usize

The alignment of pointer.

type Init = T

The type for initializers.

pub unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes an object with the given initializer. Read more

pub unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more

pub unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more

pub unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more

impl<T, U> TryFrom<U> for T where
    U: Into<T>,

[src]

type Error = Infallible

The type returned in the event of a conversion error.

pub fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>[src]

Performs the conversion.

impl<T, U> TryInto<U> for T where
    U: TryFrom<T>,

[src]

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

pub fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>[src]

Performs the conversion.