Struct rust_tokenizers::tokenizer::MBart50Tokenizer
pub struct MBart50Tokenizer { /* fields omitted */ }
MBart50 tokenizer
MBart50 tokenizer performing:
- Splitting on language and special tokens
- Text cleaning
- NFKC decomposition
- (optional) lower casing
- SentencePiece decomposition
Implementations
Create a new instance of an MBart50Tokenizer
Expects a SentencePiece protobuf file as an input.
Parameters
- path (&str): path to the SentencePiece model file
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
Example
use rust_tokenizers::tokenizer::{MBart50Tokenizer, Tokenizer};
let lower_case = false;
let tokenizer = MBart50Tokenizer::from_file("path/to/vocab/file", lower_case).unwrap();
pub fn from_existing_vocab_and_model(
vocab: MBart50Vocab,
model: SentencePieceModel,
lower_case: bool
) -> MBart50Tokenizer
Create a new instance of an MBart50Tokenizer from an existing vocabulary and model
Parameters
- vocab (MBart50Vocab): vocabulary
- model (SentencePieceModel): SentencePiece model
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
Example
use rust_tokenizers::tokenizer::{MBart50Tokenizer, Tokenizer};
use rust_tokenizers::vocab::{MBart50Vocab, SentencePieceModel, Vocab};
let lower_case = false;
let vocab = MBart50Vocab::from_file("path/to/vocab/file").unwrap();
let model = SentencePieceModel::from_file("path/to/model/file").unwrap();
let tokenizer = MBart50Tokenizer::from_existing_vocab_and_model(vocab, model, lower_case);
Trait Implementations
Tokenize a list of strings (with multithreading), where each string corresponds, for example, to a sentence, returning a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on the list provided. Read more
Multithreaded tokenization of a list of strings, returning tokens with offset information Read more
Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that in contrast with encode's optional second text, each text provided is encoded independently. Read more
Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list. Read more
Multithreaded conversion of a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids. Read more
Returns a reference to the tokenizer vocabulary
Tokenize a TokenRef, returning a sequence of tokens Read more
Converts a sequence of strings into a single string. This will clean up artifacts from tokenization (for example sub ##word) and generate a single output string. Read more
fn build_input_with_special_tokens(
&self,
tokens_ids_with_offsets_1: TokenIdsWithOffsets,
tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and adding special tokens. Read more
Tokenize a string, returning a vector of tokens as strings. Use tokenize_with_offsets or tokenize_to_tokens to return offset information. Read more
Tokenize a string, returning tokens with offset information Read more
Tokenize a list of strings, returning tokens with offset information Read more
Tokenize a list of strings, where each string corresponds, for example, to a sentence, returning a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on the list provided. Read more
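As a hypothetical sketch of the tokenize family (the file path is illustrative, and a valid SentencePiece model is assumed to exist at that location):

```rust
use rust_tokenizers::tokenizer::{MBart50Tokenizer, Tokenizer};

let tokenizer = MBart50Tokenizer::from_file("path/to/vocab/file", false).unwrap();

// Plain tokenization: a vector of token strings, no offset information
let tokens: Vec<String> = tokenizer.tokenize("Hello, world!");

// Tokenization keeping offsets into the original text
let tokens_with_offsets = tokenizer.tokenize_with_offsets("Hello, world!");

// Batch variant over a list of sentences
let batch = tokenizer.tokenize_list(&["First sentence.", "Second sentence."]);
```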
Convert a slice of string-like to a vector of token indices Read more
fn encode(
&self,
text_1: &str,
text_2: Option<&str>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> TokenizedInput
Encode a string-like (tokenization followed by encoding) Read more
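A hypothetical sketch of calling encode with the signature shown above (the file path is illustrative; TokenizedInput's token_ids field is assumed to hold the resulting ids):

```rust
use rust_tokenizers::tokenizer::{MBart50Tokenizer, Tokenizer, TruncationStrategy};

let tokenizer = MBart50Tokenizer::from_file("path/to/vocab/file", false).unwrap();

// Encode a single text, truncating to at most 128 ids
let encoded = tokenizer.encode(
    "Hello, world!",
    None,                              // no second text
    128,                               // max_len
    &TruncationStrategy::LongestFirst, // truncation strategy
    0,                                 // stride
);
// encoded.token_ids now contains the ids, including special tokens
```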
Encode a sequence of string-like texts (tokenization followed by encoding). Note that in contrast with encode's optional second text, each text provided is encoded independently. Read more
Encode a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list. Read more
Decode a sequence of token indices to a sequence of Strings, optionally skipping special indices Read more
Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. Read more
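A hypothetical round-trip sketch of decode (the file path and the ids are illustrative; the decode argument order shown is assumed):

```rust
use rust_tokenizers::tokenizer::{MBart50Tokenizer, Tokenizer, TruncationStrategy};

let tokenizer = MBart50Tokenizer::from_file("path/to/vocab/file", false).unwrap();

let encoded = tokenizer.encode(
    "Hello, world!",
    None,
    128,
    &TruncationStrategy::LongestFirst,
    0,
);

// Decode back to text, dropping special tokens and cleaning up spaces
let skip_special_tokens = true;
let clean_up_tokenization_spaces = true;
let text = tokenizer.decode(
    &encoded.token_ids,
    skip_special_tokens,
    clean_up_tokenization_spaces,
);
```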
Cleans up tokenization artifacts (for example whitespace before punctuation) Read more
Converts a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids. Read more