[−][src]Struct rust_tokenizers::tokenizer::RobertaTokenizer

pub struct RobertaTokenizer { /* fields omitted */ }

RoBERTa tokenizer

RoBERTa tokenizer performing:

splitting on special characters
whitespace splitting
(optional) lower casing
BPE tokenization

Implementations

`impl RobertaTokenizer`[src]

`pub fn from_file( vocab_path: &str, merges_path: &str, lower_case: bool, add_prefix_space: bool ) -> Result<RobertaTokenizer, TokenizerError>`[src]

Create a new instance of a RobertaTokenizer. Expects a vocabulary JSON file and a merges file as input.

Parameters

vocab_path (&str): path to the vocabulary file
merges_path (&str): path to the merges file (used as part of the BPE encoding process)
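lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
add_prefix_space (bool): flag indicating if a leading space should be added to the input text before tokenization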

Example

use rust_tokenizers::tokenizer::{RobertaTokenizer, Tokenizer};
let lower_case = false;
let add_prefix_space = true;
let tokenizer = RobertaTokenizer::from_file(
    "path/to/vocab/file",
    "path/to/merges/file",
    lower_case,
    add_prefix_space,
)
.unwrap();

`pub fn from_existing_vocab_and_merges( vocab: RobertaVocab, merges: BpePairVocab, lower_case: bool, add_prefix_space: bool ) -> RobertaTokenizer`[src]

Create a new instance of a RobertaTokenizer from an existing vocabulary and merges

Parameters

vocab (RobertaVocab): GPT-like vocabulary
merges (BpePairVocab): BPE pairs vocabulary
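lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
add_prefix_space (bool): flag indicating if a leading space should be added to the input text before tokenization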

Example

use rust_tokenizers::tokenizer::{RobertaTokenizer, Tokenizer};
use rust_tokenizers::vocab::{BpePairVocab, RobertaVocab, Vocab};
let lower_case = false;
let add_prefix_space = true;
let vocab = RobertaVocab::from_file("path/to/vocab/file").unwrap();
let merges = BpePairVocab::from_file("path/to/merges/file").unwrap();

let tokenizer = RobertaTokenizer::from_existing_vocab_and_merges(
    vocab,
    merges,
    lower_case,
    add_prefix_space,
);

Trait Implementations

`impl MultiThreadedTokenizer<RobertaVocab> for RobertaTokenizer`[src]

`fn vocab(&self) -> &T`[src]

`fn tokenize_list_with_offsets<S, ST>( &self, text_list: S ) -> Vec<TokensWithOffsets> where S: AsRef<[ST]>, ST: AsRef<str> + Sync,` [src]

`fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where S: AsRef<[ST]>, ST: AsRef<str> + Sync,` [src]

`fn encode_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput> where S: AsRef<[ST]>, ST: AsRef<str> + Sync,` [src]

`fn encode_pair_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput> where S: AsRef<[(ST, ST)]>, ST: AsRef<str> + Sync,` [src]

`fn decode_list( &self, token_ids_list: Vec<Vec<i64>>, skip_special_tokens: bool, clean_up_tokenization_spaces: bool ) -> Vec<String>`[src]

`impl Tokenizer<RobertaVocab> for RobertaTokenizer`[src]

`fn vocab(&self) -> &RobertaVocab`[src]

`fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>`[src]

`fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String`[src]

`fn build_input_with_special_tokens( &self, tokens_ids_with_offsets_1: TokenIdsWithOffsets, tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets> ) -> TokenIdsWithSpecialTokens`[src]

`fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>`[src]

`fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets`[src]

`fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where S: AsRef<[ST]>, ST: AsRef<str>,` [src]

`fn tokenize_list_with_offsets<S, ST>( &self, text_list: S ) -> Vec<TokensWithOffsets> where S: AsRef<[ST]>, ST: AsRef<str>,` [src]

`fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where S: AsRef<[ST]>, ST: AsRef<str>,` [src]

`fn encode<S: AsRef<str>>( &self, text_1: S, text_2: Option<S>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> TokenizedInput`[src]

`fn encode_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput> where S: AsRef<[ST]>, ST: AsRef<str>,` [src]

`fn encode_pair_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput> where S: AsRef<[(ST, ST)]>, ST: AsRef<str>,` [src]

`fn decode_to_vec( &self, token_ids: Vec<i64>, skip_special_tokens: bool ) -> Vec<String>`[src]

`fn decode( &self, token_ids: Vec<i64>, skip_special_tokens: bool, clean_up_tokenization_spaces: bool ) -> String`[src]

`fn clean_up_tokenization(&self, input_string: String) -> String`[src]

`fn decode_list( &self, token_ids_list: Vec<Vec<i64>>, skip_special_tokens: bool, clean_up_tokenization_spaces: bool ) -> Vec<String>`[src]

Auto Trait Implementations

`impl !RefUnwindSafe for RobertaTokenizer`

`impl Send for RobertaTokenizer`

`impl Sync for RobertaTokenizer`

`impl Unpin for RobertaTokenizer`

`impl UnwindSafe for RobertaTokenizer`

Blanket Implementations

`impl<T> Any for T where T: 'static + ?Sized,` [src]

`pub fn type_id(&self) -> TypeId`[src]

`impl<T> Borrow<T> for T where T: ?Sized,` [src]

`pub fn borrow(&self) -> &T`[src]

`impl<T> BorrowMut<T> for T where T: ?Sized,` [src]

`pub fn borrow_mut(&mut self) -> &mut T`[src]

`impl<T> From<T> for T`[src]

`pub fn from(t: T) -> T`[src]

`impl<T, U> Into<U> for T where U: From<T>,` [src]

`pub fn into(self) -> U`[src]

`impl<T> Pointable for T`

`pub const ALIGN: usize`

`type Init = T`

The type for initializers.

`pub unsafe fn init(init: <T as Pointable>::Init) -> usize`

`pub unsafe fn deref<'a>(ptr: usize) -> &'a T`

`pub unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T`

`pub unsafe fn drop(ptr: usize)`

`impl<T, U> TryFrom<U> for T where U: Into<T>,` [src]

`type Error = Infallible`

The type returned in the event of a conversion error.

`pub fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>`[src]

`impl<T, U> TryInto<U> for T where U: TryFrom<T>,` [src]

`type Error = <U as TryFrom<T>>::Error`

The type returned in the event of a conversion error.

`pub fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>`[src]

[−][src]Struct rust_tokenizers::tokenizer::RobertaTokenizer

RoBERTa tokenizer

Implementations

impl RobertaTokenizer[src]

pub fn from_file( vocab_path: &str, merges_path: &str, lower_case: bool, add_prefix_space: bool) -> Result<RobertaTokenizer, TokenizerError>[src]

Parameters

Example

pub fn from_existing_vocab_and_merges( vocab: RobertaVocab, merges: BpePairVocab, lower_case: bool, add_prefix_space: bool) -> RobertaTokenizer[src]

Parameters

Example

Trait Implementations

impl MultiThreadedTokenizer<RobertaVocab> for RobertaTokenizer[src]

fn vocab(&self) -> &T[src]

fn tokenize_list_with_offsets<S, ST>( &self, text_list: S) -> Vec<TokensWithOffsets> where S: AsRef<[ST]>, ST: AsRef<str> + Sync, [src]

fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where S: AsRef<[ST]>, ST: AsRef<str> + Sync, [src]

fn encode_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput> where S: AsRef<[ST]>, ST: AsRef<str> + Sync, [src]

fn encode_pair_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput> where S: AsRef<[(ST, ST)]>, ST: AsRef<str> + Sync, [src]

fn decode_list( &self, token_ids_list: Vec<Vec<i64>>, skip_special_tokens: bool, clean_up_tokenization_spaces: bool) -> Vec<String>[src]

impl Tokenizer<RobertaVocab> for RobertaTokenizer[src]

fn vocab(&self) -> &RobertaVocab[src]

fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>[src]

fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String[src]

fn build_input_with_special_tokens( &self, tokens_ids_with_offsets_1: TokenIdsWithOffsets, tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>) -> TokenIdsWithSpecialTokens[src]

fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>[src]

fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets[src]

fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where S: AsRef<[ST]>, ST: AsRef<str>, [src]

fn tokenize_list_with_offsets<S, ST>( &self, text_list: S) -> Vec<TokensWithOffsets> where S: AsRef<[ST]>, ST: AsRef<str>, [src]

fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where S: AsRef<[ST]>, ST: AsRef<str>, [src]

fn encode<S: AsRef<str>>( &self, text_1: S, text_2: Option<S>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> TokenizedInput[src]

fn encode_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput> where S: AsRef<[ST]>, ST: AsRef<str>, [src]

fn encode_pair_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput> where S: AsRef<[(ST, ST)]>, ST: AsRef<str>, [src]

fn decode_to_vec( &self, token_ids: Vec<i64>, skip_special_tokens: bool) -> Vec<String>[src]

fn decode( &self, token_ids: Vec<i64>, skip_special_tokens: bool, clean_up_tokenization_spaces: bool) -> String[src]

fn clean_up_tokenization(&self, input_string: String) -> String[src]

fn decode_list( &self, token_ids_list: Vec<Vec<i64>>, skip_special_tokens: bool, clean_up_tokenization_spaces: bool) -> Vec<String>[src]

Auto Trait Implementations

impl !RefUnwindSafe for RobertaTokenizer

impl Send for RobertaTokenizer

impl Sync for RobertaTokenizer

impl Unpin for RobertaTokenizer

impl UnwindSafe for RobertaTokenizer

Blanket Implementations

impl<T> Any for T where T: 'static + ?Sized, [src]

pub fn type_id(&self) -> TypeId[src]

impl<T> Borrow<T> for T where T: ?Sized, [src]

pub fn borrow(&self) -> &T[src]

impl<T> BorrowMut<T> for T where T: ?Sized, [src]

pub fn borrow_mut(&mut self) -> &mut T[src]

impl<T> From<T> for T[src]

pub fn from(t: T) -> T[src]

impl<T, U> Into<U> for T where U: From<T>, [src]

pub fn into(self) -> U[src]

impl<T> Pointable for T

pub const ALIGN: usize

type Init = T

pub unsafe fn init(init: <T as Pointable>::Init) -> usize

pub unsafe fn deref<'a>(ptr: usize) -> &'a T

pub unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

pub unsafe fn drop(ptr: usize)

impl<T, U> TryFrom<U> for T where U: Into<T>, [src]

type Error = Infallible

pub fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>[src]

impl<T, U> TryInto<U> for T where U: TryFrom<T>, [src]

type Error = <U as TryFrom<T>>::Error

pub fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>[src]

`impl RobertaTokenizer`[src]

`pub fn from_file( vocab_path: &str, merges_path: &str, lower_case: bool, add_prefix_space: bool ) -> Result<RobertaTokenizer, TokenizerError>`[src]

`pub fn from_existing_vocab_and_merges( vocab: RobertaVocab, merges: BpePairVocab, lower_case: bool, add_prefix_space: bool ) -> RobertaTokenizer`[src]

`impl MultiThreadedTokenizer<RobertaVocab> for RobertaTokenizer`[src]

`fn vocab(&self) -> &T`[src]

`fn tokenize_list_with_offsets<S, ST>( &self, text_list: S ) -> Vec<TokensWithOffsets> where S: AsRef<[ST]>, ST: AsRef<str> + Sync,` [src]

`fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where S: AsRef<[ST]>, ST: AsRef<str> + Sync,` [src]

`fn encode_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput> where S: AsRef<[ST]>, ST: AsRef<str> + Sync,` [src]

`fn encode_pair_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput> where S: AsRef<[(ST, ST)]>, ST: AsRef<str> + Sync,` [src]

`fn decode_list( &self, token_ids_list: Vec<Vec<i64>>, skip_special_tokens: bool, clean_up_tokenization_spaces: bool ) -> Vec<String>`[src]

`impl Tokenizer<RobertaVocab> for RobertaTokenizer`[src]

`fn vocab(&self) -> &RobertaVocab`[src]

`fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>`[src]

`fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String`[src]

`fn build_input_with_special_tokens( &self, tokens_ids_with_offsets_1: TokenIdsWithOffsets, tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets> ) -> TokenIdsWithSpecialTokens`[src]

`fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>`[src]

`fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets`[src]

`fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where S: AsRef<[ST]>, ST: AsRef<str>,` [src]

`fn tokenize_list_with_offsets<S, ST>( &self, text_list: S ) -> Vec<TokensWithOffsets> where S: AsRef<[ST]>, ST: AsRef<str>,` [src]

`fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where S: AsRef<[ST]>, ST: AsRef<str>,` [src]

`fn encode<S: AsRef<str>>( &self, text_1: S, text_2: Option<S>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> TokenizedInput`[src]

`fn encode_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput> where S: AsRef<[ST]>, ST: AsRef<str>,` [src]

`fn encode_pair_list<S, ST>( &self, text_list: S, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize ) -> Vec<TokenizedInput> where S: AsRef<[(ST, ST)]>, ST: AsRef<str>,` [src]

`fn decode_to_vec( &self, token_ids: Vec<i64>, skip_special_tokens: bool ) -> Vec<String>`[src]

`fn decode( &self, token_ids: Vec<i64>, skip_special_tokens: bool, clean_up_tokenization_spaces: bool ) -> String`[src]

`fn clean_up_tokenization(&self, input_string: String) -> String`[src]

`fn decode_list( &self, token_ids_list: Vec<Vec<i64>>, skip_special_tokens: bool, clean_up_tokenization_spaces: bool ) -> Vec<String>`[src]

`impl !RefUnwindSafe for RobertaTokenizer`

`impl Send for RobertaTokenizer`

`impl Sync for RobertaTokenizer`

`impl Unpin for RobertaTokenizer`

`impl UnwindSafe for RobertaTokenizer`

`impl<T> Any for T where T: 'static + ?Sized,` [src]

`pub fn type_id(&self) -> TypeId`[src]

`impl<T> Borrow<T> for T where T: ?Sized,` [src]

`pub fn borrow(&self) -> &T`[src]

`impl<T> BorrowMut<T> for T where T: ?Sized,` [src]

`pub fn borrow_mut(&mut self) -> &mut T`[src]

`impl<T> From<T> for T`[src]

`pub fn from(t: T) -> T`[src]

`impl<T, U> Into<U> for T where U: From<T>,` [src]

`pub fn into(self) -> U`[src]

`impl<T> Pointable for T`

`pub const ALIGN: usize`

`type Init = T`

`pub unsafe fn init(init: <T as Pointable>::Init) -> usize`

`pub unsafe fn deref<'a>(ptr: usize) -> &'a T`

`pub unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T`

`pub unsafe fn drop(ptr: usize)`

`impl<T, U> TryFrom<U> for T where U: Into<T>,` [src]

`type Error = Infallible`

`pub fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>`[src]

`impl<T, U> TryInto<U> for T where U: TryFrom<T>,` [src]

`type Error = <U as TryFrom<T>>::Error`

`pub fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>`[src]