Struct rust_tokenizers::tokenizer::BertTokenizer
BERT tokenizer
BERT tokenizer performing:
- BaseTokenizer tokenization (see BaseTokenizer for more details)
- WordPiece tokenization
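In practice, the base tokenizer first cleans, whitespace- and punctuation-splits, and optionally lower-cases the text; each resulting word is then greedily split into WordPiece sub-tokens from the vocabulary, with continuation pieces prefixed by ##. A minimal sketch, assuming a standard BERT vocabulary file at the placeholder path (exact splits depend on the vocabulary used):

use rust_tokenizers::tokenizer::{BertTokenizer, Tokenizer};

let tokenizer = BertTokenizer::from_file("path/to/vocab/file", true, true).unwrap();
// With a standard BERT vocabulary, an out-of-vocabulary word is decomposed
// into sub-word pieces, e.g. "unaffable" -> ["un", "##aff", "##able"].
let tokens = tokenizer.tokenize("unaffable");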
Implementations
impl BertTokenizer
pub fn from_file(
    path: &str,
    lower_case: bool,
    strip_accents: bool
) -> Result<BertTokenizer, TokenizerError>
Create a new instance of a BertTokenizer
Expects a vocabulary flat file as input.
Parameters
- path (&str): path to the vocabulary file
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
- strip_accents (bool): flag indicating if accents should be stripped from the text
Example
use rust_tokenizers::tokenizer::{BertTokenizer, Tokenizer};

let strip_accents = false;
let lower_case = false;
let tokenizer = BertTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
pub fn from_existing_vocab(
    vocab: BertVocab,
    lower_case: bool,
    strip_accents: bool
) -> BertTokenizer
Create a new instance of a BertTokenizer from an existing vocabulary
Parameters
- vocab (BertVocab): Thread-safe reference to a BERT vocabulary
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
- strip_accents (bool): flag indicating if accents should be stripped from the text
Example
use rust_tokenizers::tokenizer::{BertTokenizer, Tokenizer};
use rust_tokenizers::vocab::{BertVocab, Vocab};

let strip_accents = false;
let lower_case = false;
let vocab = BertVocab::from_file("path/to/vocab/file").unwrap();
let tokenizer = BertTokenizer::from_existing_vocab(vocab, lower_case, strip_accents);
Trait Implementations
impl MultiThreadedTokenizer<BertVocab> for BertTokenizer
fn vocab(&self) -> &T
fn tokenize_list_with_offsets<S, ST>(
    &self,
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
fn encode_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
fn encode_pair_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str> + Sync,
fn decode_list(
    &self,
    token_ids_list: Vec<Vec<i64>>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>
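These batch methods mirror the sequential Tokenizer API but process the input list in parallel. A minimal sketch, assuming a vocabulary file at the placeholder path:

use rust_tokenizers::tokenizer::{BertTokenizer, MultiThreadedTokenizer};

let tokenizer = BertTokenizer::from_file("path/to/vocab/file", true, true).unwrap();
let texts = ["First sentence to tokenize.", "Second sentence to tokenize."];
// Tokenizer also defines tokenize_list, so a fully qualified call keeps the
// dispatch unambiguous when both traits are imported.
let token_lists = MultiThreadedTokenizer::tokenize_list(&tokenizer, &texts);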
impl Tokenizer<BertVocab> for BertTokenizer
fn vocab(&self) -> &BertVocab
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>
fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
fn tokenize_list_with_offsets<S, ST>(
    &self,
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
fn encode<S: AsRef<str>>(
    &self,
    text_1: S,
    text_2: Option<S>,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> TokenizedInput
fn encode_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
fn encode_pair_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str>,
fn decode_to_vec(
    &self,
    token_ids: Vec<i64>,
    skip_special_tokens: bool
) -> Vec<String>
fn decode(
    &self,
    token_ids: Vec<i64>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> String
fn clean_up_tokenization(&self, input_string: String) -> String
fn decode_list(
    &self,
    token_ids_list: Vec<Vec<i64>>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>
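A minimal end-to-end sketch of the encode/decode cycle, assuming a vocabulary file at the placeholder path; the sentence pair is arbitrary:

use rust_tokenizers::tokenizer::{BertTokenizer, Tokenizer, TruncationStrategy};

let tokenizer = BertTokenizer::from_file("path/to/vocab/file", true, true).unwrap();
// Encode a sentence pair: BERT-style special tokens are added around and
// between the two segments, truncating to at most 128 tokens if needed.
let input = tokenizer.encode(
    "The quick brown fox",
    Some("jumps over the lazy dog"),
    128,
    &TruncationStrategy::LongestFirst,
    0,
);
// Round-trip back to text, dropping special tokens and cleaning up spacing.
let text = tokenizer.decode(input.token_ids, true, true);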
Auto Trait Implementations
impl RefUnwindSafe for BertTokenizer
impl Send for BertTokenizer
impl Sync for BertTokenizer
impl Unpin for BertTokenizer
impl UnwindSafe for BertTokenizer
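Since BertTokenizer is Send and Sync, a single instance can be shared across threads, for example behind an Arc. A minimal sketch with std::thread (the vocab path is again a placeholder):

use std::sync::Arc;
use std::thread;
use rust_tokenizers::tokenizer::{BertTokenizer, Tokenizer};

let tokenizer = Arc::new(BertTokenizer::from_file("path/to/vocab/file", true, true).unwrap());
let handles: Vec<_> = ["First text.", "Second text."]
    .iter()
    .map(|text| {
        let tokenizer = Arc::clone(&tokenizer);
        let text = text.to_string();
        // Each thread tokenizes independently through the shared instance.
        thread::spawn(move || tokenizer.tokenize(&text))
    })
    .collect();
for handle in handles {
    println!("{:?}", handle.join().unwrap());
}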
Blanket Implementations
impl<T> Any for T where
    T: 'static + ?Sized,
impl<T> Borrow<T> for T where
    T: ?Sized,
impl<T> BorrowMut<T> for T where
    T: ?Sized,
pub fn borrow_mut(&mut self) -> &mut T
impl<T> From<T> for T
impl<T, U> Into<U> for T where
    U: From<T>,
impl<T> Pointable for T
pub const ALIGN: usize
type Init = T
The type for initializers.
pub unsafe fn init(init: <T as Pointable>::Init) -> usize
pub unsafe fn deref<'a>(ptr: usize) -> &'a T
pub unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T
pub unsafe fn drop(ptr: usize)
impl<T, U> TryFrom<U> for T where
    U: Into<T>,
type Error = Infallible
The type returned in the event of a conversion error.
pub fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>
impl<T, U> TryInto<U> for T where
    U: TryFrom<T>,