Struct rust_tokenizers::vocab::MarianVocab[−][src]

pub struct MarianVocab {
    pub values: HashMap<String, i64>,
    pub indices: HashMap<i64, String>,
    pub unknown_value: &'static str,
    pub special_values: HashMap<String, i64>,
    pub special_indices: HashMap<i64, String>,
}

Expand description

Marian Vocab

Vocabulary for Marian tokenizer. Contains the following special values:

PAD token
EOS token

Expects a JSON-format vocabulary when created from file.

Fields

values: HashMap<String, i64>

A mapping of tokens as string to indices (i.e. the encoder base)

indices: HashMap<i64, String>

A mapping of token ids to strings (i.e. the decoder base)

unknown_value: &'static str

The string to use for unknown (out of vocabulary) tokens

special_values: HashMap<String, i64>

A mapping of special value tokens as strings to IDs (i.e. the encoder base for special values), special values typically include things like BOS/EOS markers, class markers, mask markers and padding markers

special_indices: HashMap<i64, String>

A mapping of special value tokens as IDs to strings (i.e. the decoder base for special values)

Implementations

impl MarianVocab[src]

pub fn pad_value() -> &'static str[src]

Returns the PAD token for Marian (<pad>)

pub fn eos_value() -> &'static str[src]

Returns the EOS token for Marian (</s>)

Trait Implementations

impl Clone for MarianVocab[src]

fn clone(&self) -> MarianVocab[src]

Returns a copy of the value. Read more

fn clone_from(&mut self, source: &Self)1.0.0 [src]

Performs copy-assignment from source. Read more

impl Debug for MarianVocab[src]

fn fmt(&self, f: &mut Formatter<'_>) -> Result[src]

Formats the value using the given formatter. Read more

impl MultiThreadedTokenizer<MarianVocab> for MarianTokenizer[src]

fn vocab(&self) -> &T[src]

returns a reference to the tokenizer vocabulary

fn tokenize_list_with_offsets<S, ST>(
    &self, 
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,

[src]

Tokenize a list of strings (with multithreading), where each string corresponds to, for example, a sentence. Returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on the list provided. Read more

fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,

[src]

Multithreaded tokenization of a list of strings, returning the tokens of each string as a vector of strings (without offset information) Read more

fn encode_list<S, ST>(
    &self, 
    text_list: S, 
    max_len: usize, 
    truncation_strategy: &TruncationStrategy, 
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,

[src]

Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with the optional second text of encode, each text provided is encoded independently. Read more

fn encode_pair_list<S, ST>(
    &self, 
    text_list: S, 
    max_len: usize, 
    truncation_strategy: &TruncationStrategy, 
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str> + Sync,

[src]

Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list. Read more

fn decode_list(
    &self, 
    token_ids_list: Vec<Vec<i64>>, 
    skip_special_tokens: bool, 
    clean_up_tokenization_spaces: bool
) -> Vec<String>

[src]

Multithreaded conversion of a list of sequences of ids (integers) into a list of strings, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids. Read more

impl Tokenizer<MarianVocab> for MarianTokenizer[src]

fn vocab(&self) -> &MarianVocab[src]

returns a reference to the tokenizer vocabulary

fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>[src]

Tokenize a TokenRef, returning a sequence of tokens Read more

fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String[src]

Converts a sequence of strings into a single string. This will clean-up artifacts from tokenization (for example sub ##word) and generate a single output string Read more

fn build_input_with_special_tokens(
    &self, 
    tokens_ids_with_offsets_1: TokenIdsWithOffsets, 
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens

[src]

Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and adding special tokens. Read more

fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>[src]

Tokenize a string, returns a vector of tokens as strings. Use tokenize_with_offsets or tokenize_to_tokens to return offset information. Read more

fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets[src]

Tokenize a string, returning tokens with offset information Read more

fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,

[src]

Tokenize a list of strings, returning the tokens of each string as a vector of strings (without offset information) Read more

fn tokenize_list_with_offsets<S, ST>(
    &self, 
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,

[src]

Tokenize a list of strings, where each string corresponds to, for example, a sentence. Returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on the list provided. Read more

fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,

[src]

Convert a slice of string-like values to a vector of token indices Read more

fn encode<S: AsRef<str>>(
    &self, 
    text_1: S, 
    text_2: Option<S>, 
    max_len: usize, 
    truncation_strategy: &TruncationStrategy, 
    stride: usize
) -> TokenizedInput

[src]

Encode a string-like (tokenization followed by encoding) Read more

fn encode_list<S, ST>(
    &self, 
    text_list: S, 
    max_len: usize, 
    truncation_strategy: &TruncationStrategy, 
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,

[src]

Encode a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with the optional second text of encode, each text provided is encoded independently. Read more

fn encode_pair_list<S, ST>(
    &self, 
    text_list: S, 
    max_len: usize, 
    truncation_strategy: &TruncationStrategy, 
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str>,

[src]

Encode a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list. Read more

fn decode_to_vec(
    &self, 
    token_ids: Vec<i64>, 
    skip_special_tokens: bool
) -> Vec<String>

[src]

Decode a sequence of token indices to a sequence of Strings, optionally skipping special indices Read more

fn decode(
    &self, 
    token_ids: Vec<i64>, 
    skip_special_tokens: bool, 
    clean_up_tokenization_spaces: bool
) -> String

[src]

Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. Read more

fn clean_up_tokenization(&self, input_string: String) -> String[src]

Cleans-up tokenization artifacts (for example whitespace before punctuation) Read more

fn decode_list(
    &self, 
    token_ids_list: Vec<Vec<i64>>, 
    skip_special_tokens: bool, 
    clean_up_tokenization_spaces: bool
) -> Vec<String>

[src]

Converts a list of sequences of ids (integers) into a list of strings, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids. Read more

impl Vocab for MarianVocab[src]

fn unknown_value() -> &'static str[src]

Associative function returning the unknown value for the vocabulary

fn get_unknown_value(&self) -> &'static str[src]

Returns the unknown value on an instance

fn values(&self) -> &HashMap<String, i64>[src]

Return the map of token strings to IDs

fn indices(&self) -> &HashMap<i64, String>[src]

Return the map of token IDs to strings

fn special_values(&self) -> &HashMap<String, i64>[src]

Return the map of token strings to IDs for special values

fn special_indices(&self) -> &HashMap<i64, String>[src]

Return the map of token IDs to strings for special values

fn from_file(path: &str) -> Result<MarianVocab, TokenizerError>[src]

Read a vocabulary from file Read more

fn token_to_id(&self, token: &str) -> i64[src]

Converts a token to an id. Read more

fn id_to_token(&self, id: &i64) -> String[src]

Converts an id to a token. Read more

fn read_vocab_file(path: &str) -> Result<HashMap<String, i64>, TokenizerError>[src]

Read a Bert-style vocab.txt file (single column, one token per line). The from_file method should be preferred, and needs to be implemented by the specific vocabularies. Read more

fn _token_to_id(
    &self, 
    token: &str, 
    values: &HashMap<String, i64>, 
    special_values: &HashMap<String, i64>, 
    unknown_value: &str
) -> i64

[src]

Converts a token to an id, provided a HashMap of values, a HashMap of special values and the unknown value token string representation. This is not meant to be directly used, the method token_to_id offers a more convenient interface for most vocabularies, but needs to be implemented by the specific vocabulary. Read more

fn _id_to_token(
    &self, 
    id: &i64, 
    indices: &HashMap<i64, String>, 
    special_indices: &HashMap<i64, String>, 
    unknown_value: &str
) -> String

[src]

Converts an id to a token, provided a HashMap of values, a HashMap of special values and the unknown value token string representation. This is not meant to be directly used, the method id_to_token offers a more convenient interface for most vocabularies, but needs to be implemented by the specific vocabulary. Read more

fn _register_as_special_value(
    token: &str, 
    values: &HashMap<String, i64>, 
    special_values: &mut HashMap<String, i64>
) -> Result<(), TokenizerError>

[src]

Register a token as a special value Read more

fn convert_tokens_to_ids(&self, tokens: &[&str]) -> Vec<i64>[src]

Converts a list of tokens to a list of indices. Read more

Auto Trait Implementations

impl RefUnwindSafe for MarianVocab

impl Send for MarianVocab

impl Sync for MarianVocab

impl Unpin for MarianVocab

impl UnwindSafe for MarianVocab

Blanket Implementations

impl<T> Any for T where
    T: 'static + ?Sized,

[src]

pub fn type_id(&self) -> TypeId[src]

Gets the TypeId of self. Read more

impl<T> Borrow<T> for T where
    T: ?Sized,

[src]

pub fn borrow(&self) -> &T[src]

Immutably borrows from an owned value. Read more

impl<T> BorrowMut<T> for T where
    T: ?Sized,

[src]

pub fn borrow_mut(&mut self) -> &mut T[src]

Mutably borrows from an owned value. Read more

impl<T> From<T> for T[src]

pub fn from(t: T) -> T[src]

Performs the conversion.

impl<T, U> Into<U> for T where
    U: From<T>,

[src]

pub fn into(self) -> U[src]

Performs the conversion.

impl<T> Pointable for T

pub const ALIGN: usize

The alignment of pointer.

type Init = T

The type for initializers.

pub unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a value with the given initializer. Read more

pub unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more

pub unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more

pub unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more

impl<T> ToOwned for T where
    T: Clone,

[src]

type Owned = T

The resulting type after obtaining ownership.

pub fn to_owned(&self) -> T[src]

Creates owned data from borrowed data, usually by cloning. Read more

pub fn clone_into(&self, target: &mut T)[src]

🔬 This is a nightly-only experimental API. (toowned_clone_into)

recently added

Uses borrowed data to replace owned data, usually by cloning. Read more

impl<T, U> TryFrom<U> for T where
    U: Into<T>,

[src]

type Error = Infallible

The type returned in the event of a conversion error.

pub fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>[src]

Performs the conversion.

impl<T, U> TryInto<U> for T where
    U: TryFrom<T>,

[src]

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

pub fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>[src]

Performs the conversion.