Struct rust_tokenizers::tokenizer::AlbertTokenizer
pub struct AlbertTokenizer { /* fields omitted */ }
ALBERT tokenizer
ALBERT tokenizer performing:
- splitting on special characters
- text cleaning
- NFKC decomposition
- (optional) lower casing
- (optional) accent stripping
- SentencePiece decomposition
Implementations
pub fn from_file(
    path: &str,
    lower_case: bool,
    strip_accents: bool
) -> Result<AlbertTokenizer, TokenizerError>
Create a new instance of an AlbertTokenizer.
Expects a SentencePiece protobuf file as an input.
Parameters
- path (&str): path to the SentencePiece model file
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
- strip_accents (bool): flag indicating if accents should be stripped from the text
Example
use rust_tokenizers::tokenizer::{AlbertTokenizer, Tokenizer};
let strip_accents = false;
let lower_case = false;
let tokenizer =
    AlbertTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

pub fn from_existing_vocab_and_model(
    vocab: AlbertVocab,
    model: SentencePieceModel,
    lower_case: bool,
    strip_accents: bool
) -> AlbertTokenizer
Create a new instance of an AlbertTokenizer from an existing vocabulary and model.
Parameters
- vocab (AlbertVocab): vocabulary
- model (SentencePieceModel): SentencePiece model
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
- strip_accents (bool): flag indicating if accents should be stripped from the text
Example
use rust_tokenizers::tokenizer::{AlbertTokenizer, Tokenizer};
use rust_tokenizers::vocab::{AlbertVocab, SentencePieceModel, Vocab};
let strip_accents = false;
let lower_case = false;
let vocab = AlbertVocab::from_file("path/to/vocab/file").unwrap();
let model = SentencePieceModel::from_file("path/to/model/file").unwrap();
let tokenizer =
    AlbertTokenizer::from_existing_vocab_and_model(vocab, model, lower_case, strip_accents);

Trait Implementations
Tokenize a list of strings (with multithreading), where each corresponds to for example a sentence, returns a
vector of TokensWithOffsets containing the tokens and their offset information. This calls
tokenize_with_offsets on the list provided. Read more
Multithreaded tokenization of a list of strings, returning tokens with offset information Read more
Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that in contrast
with encode's optional second text, each text provided is encoded independently. Read more
Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines
encode with the list processing of encode_list. Read more
Multithreaded conversion of a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary,
with options to remove special tokens and clean up tokenization spaces. This calls decode
for each provided sequence of ids. Read more
Returns a reference to the tokenizer vocabulary
Tokenize a TokenRef, returning a sequence of tokens Read more
Converts a sequence of strings into a single string. This will clean up artifacts from tokenization
(for example sub ##word) and generate a single output string Read more
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. Read more
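For ALBERT, this concatenation follows the BERT-style [CLS] … [SEP] (… [SEP]) layout. The sketch below illustrates that layout in plain Rust with hypothetical special-token ids (2 standing in for [CLS], 3 for [SEP]); the real trait method operates on TokenIdsWithOffsets and also tracks offset information:

```rust
// Illustrative sketch of the [CLS]/[SEP] layout used to build
// sequence-classification inputs. Ids 2 and 3 are hypothetical
// stand-ins for the [CLS] and [SEP] special tokens.
fn build_input(ids_1: &[i64], ids_2: Option<&[i64]>) -> (Vec<i64>, Vec<i8>) {
    const CLS: i64 = 2;
    const SEP: i64 = 3;
    let mut token_ids = vec![CLS];
    token_ids.extend_from_slice(ids_1);
    token_ids.push(SEP);
    // Segment ids: 0 for the first sequence (and its special tokens),
    // 1 for the second sequence and its trailing [SEP].
    let mut segment_ids = vec![0i8; token_ids.len()];
    if let Some(ids_2) = ids_2 {
        token_ids.extend_from_slice(ids_2);
        token_ids.push(SEP);
        segment_ids.extend(std::iter::repeat(1i8).take(ids_2.len() + 1));
    }
    (token_ids, segment_ids)
}

fn main() {
    let (ids, segments) = build_input(&[10, 11], Some(&[20]));
    println!("{:?}", ids);      // [2, 10, 11, 3, 20, 3]
    println!("{:?}", segments); // [0, 0, 0, 0, 1, 1]
}
```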
Tokenize a string, returning a vector of tokens as strings.
Use tokenize_with_offsets or tokenize_to_tokens to return offset information. Read more
Tokenize a string, returning tokens with offset information Read more
Tokenize a list of strings, returning tokens with offset information Read more
Tokenize a list of strings, where each corresponds to for example a sentence, returns a
vector of TokensWithOffsets containing the tokens and their offset information. This calls
tokenize_with_offsets on the list provided. Read more
Convert a slice of string-likes to a vector of token indices Read more
fn encode(
    &self,
    text_1: &str,
    text_2: Option<&str>,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> TokenizedInput
Encode a string-like (tokenization followed by encoding) Read more
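The max_len and stride parameters govern truncation: sequences longer than max_len are cut, and stride controls how much overlapping context an overflowing window keeps. A plain-Rust sketch of that idea (an illustration of the concept, not the crate's actual truncation logic):

```rust
// Illustrative sketch of length truncation with an overflow window.
// `stride` controls how many tokens of context the overflow re-includes
// from the end of the kept window.
fn truncate(ids: &[i64], max_len: usize, stride: usize) -> (Vec<i64>, Vec<i64>) {
    if ids.len() <= max_len {
        return (ids.to_vec(), Vec::new());
    }
    let kept = ids[..max_len].to_vec();
    // The overflow keeps `stride` tokens of overlap with the kept window.
    let overflow_start = max_len.saturating_sub(stride);
    let overflow = ids[overflow_start..].to_vec();
    (kept, overflow)
}

fn main() {
    let (kept, overflow) = truncate(&[1, 2, 3, 4, 5], 3, 1);
    println!("{:?} {:?}", kept, overflow); // [1, 2, 3] [3, 4, 5]
}
```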
Encode a sequence of string-like texts (tokenization followed by encoding). Note that in contrast
with encode's optional second text, each text provided is encoded independently. Read more
Encode a sequence of string-like text pairs (tokenization followed by encoding). This combines
encode with the list processing of encode_list. Read more
Decode a sequence of token indices to a sequence of Strings, optionally skipping special indices Read more
Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. Read more
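Since ALBERT tokens are SentencePiece pieces, decoding essentially looks each id up in the vocabulary and maps the word-boundary marker '▁' (U+2581) back to a space. A minimal sketch with a hypothetical toy vocabulary (the trait method uses the real AlbertVocab and handles special tokens):

```rust
use std::collections::HashMap;

// Toy id-to-piece table standing in for the ALBERT vocabulary.
fn decode(ids: &[i64], vocab: &HashMap<i64, &str>) -> String {
    // Concatenate pieces, then restore the spaces that SentencePiece
    // encodes as the '▁' (U+2581) word-boundary marker.
    let joined: String = ids.iter().filter_map(|id| vocab.get(id)).copied().collect();
    joined.replace('\u{2581}', " ").trim().to_string()
}

fn main() {
    let vocab = HashMap::from([(0, "\u{2581}hello"), (1, "\u{2581}wor"), (2, "ld")]);
    println!("{}", decode(&[0, 1, 2], &vocab)); // hello world
}
```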
Cleans up tokenization artifacts (for example whitespace before punctuation) Read more
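The kind of cleanup meant here can be sketched in plain Rust: drop the stray space that detokenization leaves before common punctuation. This is illustrative only; the trait method covers more cases:

```rust
// Sketch of tokenization-space cleanup: remove the space left
// before common punctuation marks after joining tokens.
fn clean_up(text: &str) -> String {
    text.replace(" .", ".")
        .replace(" ,", ",")
        .replace(" !", "!")
        .replace(" ?", "?")
}

fn main() {
    println!("{}", clean_up("Hello , world !")); // Hello, world!
}
```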
Converts a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary,
with options to remove special tokens and clean up tokenization spaces. This calls decode
for each provided sequence of ids. Read more