Struct rust_tokenizers::tokenizer::BaseTokenizer
Base tokenizer
Base tokenizer performing:
- whitespace tokenization
- splitting on special characters
- splitting on punctuation
- splitting on CJK characters
- (optional) lower casing
- (optional) accent stripping
This tokenizer is used as a pre-tokenizer step in the BERT and GPT tokenizers.
Implementations
impl<T: Vocab + Sync + Send> BaseTokenizer<T>
[src]
pub fn from_file(
    path: &str,
    lower_case: bool,
    strip_accents: bool
) -> Result<BaseTokenizer<T>, TokenizerError>
[src]
Create a new instance of a BaseTokenizer
Expects a vocabulary flat-file as an input.
Parameters
- path (&str): path to the vocabulary file (only used for special character splitting)
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
- strip_accents (bool): flag indicating if accents should be stripped from the text
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
pub fn from_existing_vocab(
    vocab: T,
    lower_case: bool,
    strip_accents: bool
) -> BaseTokenizer<T>
[src]
Create a new instance of a BaseTokenizer
from an existing vocabulary
Parameters
- vocab (Vocab): Thread-safe reference to a vocabulary
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
- strip_accents (bool): flag indicating if accents should be stripped from the text
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::{BaseVocab, Vocab};
let strip_accents = false;
let lower_case = false;
let base_vocab = BaseVocab::from_file("path/to/vocab/file").unwrap();
let tokenizer = BaseTokenizer::from_existing_vocab(base_vocab, lower_case, strip_accents);
Trait Implementations
impl<T: Vocab + Sync + Send> MultiThreadedTokenizer<T> for BaseTokenizer<T>
[src]
fn vocab(&self) -> &T
[src]
fn tokenize_list_with_offsets<S, ST>(
    &self,
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
[src]
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
[src]
fn encode_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
[src]
fn encode_pair_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str> + Sync,
[src]
fn decode_list(
    &self,
    token_ids_list: Vec<Vec<i64>>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>
[src]
impl<T: Vocab + Sync + Send> Tokenizer<T> for BaseTokenizer<T>
[src]
fn vocab(&self) -> &T
[src]
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
[src]
fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>
[src]
fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets
[src]
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
[src]
fn tokenize_list_with_offsets<S, ST>(
    &self,
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
[src]
fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
[src]
fn encode<S: AsRef<str>>(
    &self,
    text_1: S,
    text_2: Option<S>,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> TokenizedInput
[src]
fn encode_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
[src]
fn encode_pair_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str>,
[src]
fn decode_to_vec(
    &self,
    token_ids: Vec<i64>,
    skip_special_tokens: bool
) -> Vec<String>
[src]
fn decode(
    &self,
    token_ids: Vec<i64>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> String
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
fn clean_up_tokenization(&self, input_string: String) -> String
[src]
fn decode_list(
    &self,
    token_ids_list: Vec<Vec<i64>>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>
[src]
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
[src]
Auto Trait Implementations
impl<T> RefUnwindSafe for BaseTokenizer<T> where
    T: RefUnwindSafe,
impl<T> Send for BaseTokenizer<T> where
    T: Send,
impl<T> Sync for BaseTokenizer<T> where
    T: Sync,
impl<T> Unpin for BaseTokenizer<T> where
    T: Unpin,
impl<T> UnwindSafe for BaseTokenizer<T> where
    T: UnwindSafe,
Blanket Implementations
impl<T> Any for T where
    T: 'static + ?Sized,
[src]
impl<T> Borrow<T> for T where
    T: ?Sized,
[src]
impl<T> BorrowMut<T> for T where
    T: ?Sized,
[src]
pub fn borrow_mut(&mut self) -> &mut T
[src]
impl<T> From<T> for T
[src]
impl<T, U> Into<U> for T where
    U: From<T>,
[src]
impl<T> Pointable for T
pub const ALIGN: usize
type Init = T
The type for initializers.
pub unsafe fn init(init: <T as Pointable>::Init) -> usize
pub unsafe fn deref<'a>(ptr: usize) -> &'a T
pub unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T
pub unsafe fn drop(ptr: usize)
impl<T, U> TryFrom<U> for T where
    U: Into<T>,
[src]
type Error = Infallible
The type returned in the event of a conversion error.
pub fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>
[src]
impl<T, U> TryInto<U> for T where
    U: TryFrom<T>,
[src]