pub struct BaseTokenizer<T: Vocab> { /* private fields */ }
§Base tokenizer
Base tokenizer performing:
- whitespace tokenization
- splitting on special characters
- splitting on punctuation
- splitting on CJK characters
- (optional) lower casing
- (optional) accent stripping
This tokenizer is used as a pre-tokenizer step in the BERT and GPT tokenizers.
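For illustration, a minimal sketch of the resulting behaviour (the vocabulary path is a placeholder, and the output shown in the comment is indicative only):
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let lower_case = true;
let strip_accents = true;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
// Whitespace and punctuation splitting, lower casing and accent stripping:
// "Héllo, World!" would tokenize to something like ["hello", ",", "world", "!"]
let tokens = tokenizer.tokenize("Héllo, World!");
println!("{tokens:?}");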
Implementations§
impl<T: Vocab + Sync> BaseTokenizer<T>
pub fn from_file_with_special_token_mapping<P: AsRef<Path>, S: AsRef<Path>>(
    path: P,
    lower_case: bool,
    strip_accents: bool,
    special_token_mapping_path: S,
) -> Result<BaseTokenizer<T>, TokenizerError>
Create a new instance of a BaseTokenizer. Expects a vocabulary flat-file and a special token mapping file as inputs.
§Parameters
- path (&str): path to the vocabulary file (only used for special character splitting)
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
- strip_accents (bool): flag indicating if accents should be stripped from the text
- special_token_mapping_path (&str): path to a special token mapping file to overwrite default special tokens
§Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> = BaseTokenizer::from_file_with_special_token_mapping(
"path/to/vocab/file",
lower_case,
strip_accents,
"path/to/special/token/mapping/file",
)
.unwrap();
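The mapping file is parsed as JSON; a minimal sketch of creating one (the unk_token field name is an assumption about the expected schema, not confirmed here):
use std::io::Write;

let mut mapping_file = std::fs::File::create("special_token_mapping.json").unwrap();
// Hypothetical content: override the default unknown token.
mapping_file
    .write_all(br#"{"unk_token": "[UNK]"}"#)
    .unwrap();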
pub fn from_file<P: AsRef<Path>>(
    path: P,
    lower_case: bool,
    strip_accents: bool,
) -> Result<BaseTokenizer<T>, TokenizerError>
Create a new instance of a BaseTokenizer. Expects a vocabulary flat-file as input.
§Parameters
- path (&str): path to the vocabulary file (only used for special character splitting)
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
- strip_accents (bool): flag indicating if accents should be stripped from the text
§Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
pub fn from_existing_vocab(
    vocab: T,
    lower_case: bool,
    strip_accents: bool,
) -> BaseTokenizer<T>
Create a new instance of a BaseTokenizer from an existing vocabulary.
§Parameters
- vocab (Vocab): thread-safe reference to a vocabulary
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
- strip_accents (bool): flag indicating if accents should be stripped from the text
§Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::{BaseVocab, Vocab};
let strip_accents = false;
let lower_case = false;
let base_vocab = BaseVocab::from_file("path/to/vocab/file").unwrap();
let tokenizer = BaseTokenizer::from_existing_vocab(base_vocab, lower_case, strip_accents);
Trait Implementations§
impl<T: Vocab + Sync + Send> MultiThreadedTokenizer<T> for BaseTokenizer<T>
fn tokenize_list_with_offsets<S>(
    &self,
    text_list: &[S],
) -> Vec<TokensWithOffsets>
Tokenize a list of strings (with multithreading), where each corresponds to, for example, a sentence. Returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on each element of the list provided.

fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>

Multithreaded tokenization of a list of strings, returning a vector of token vectors.
fn encode_list<S>(
    &self,
    text_list: &[S],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize,
) -> Vec<TokenizedInput>
Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with encode's optional second text, each text provided here is encoded independently.
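A minimal usage sketch (the vocabulary path is a placeholder; because both Tokenizer and MultiThreadedTokenizer provide encode_list, the call is disambiguated through the trait):
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;

let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", true, true).unwrap();
let texts = ["First sentence.", "A second, longer sentence."];
// Each text is encoded independently, truncated to at most 128 tokens.
let encoded = MultiThreadedTokenizer::encode_list(
    &tokenizer,
    &texts,
    128,
    &TruncationStrategy::LongestFirst,
    0,
);
assert_eq!(encoded.len(), texts.len());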
fn encode_pair_list<S>(
    &self,
    text_list: &[(S, S)],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize,
) -> Vec<TokenizedInput>
Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list.
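The pair variant follows the same pattern; a short sketch (text content is illustrative):
use rust_tokenizers::tokenizer::{BaseTokenizer, MultiThreadedTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;

let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", true, true).unwrap();
// Each (text_1, text_2) pair is combined into a single encoded input.
let pairs = [("What is a tokenizer?", "It splits text into tokens.")];
let encoded = MultiThreadedTokenizer::encode_pair_list(
    &tokenizer,
    &pairs,
    128,
    &TruncationStrategy::LongestFirst,
    0,
);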
fn decode_list(
    &self,
    token_ids_list: &[Vec<i64>],
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool,
) -> Vec<String>
Multithreaded conversion of a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids.

impl<T: Vocab + Sync + Send> Tokenizer<T> for BaseTokenizer<T>
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>

Tokenize a TokenRef, returning a sequence of tokens.
fn tokenize(&self, text: &str) -> Vec<String>
Tokenize a string, returning a vector of tokens as strings. Use tokenize_with_offsets or tokenize_to_tokens to also return offset information.

fn tokenize_with_offsets(&self, text: &str) -> TokensWithOffsets
Tokenize a string, returning tokens with offset information.
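A sketch of inspecting offsets (the tokens and offsets field names are assumed from TokensWithOffsets):
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", true, true).unwrap();
let output = tokenizer.tokenize_with_offsets("Hello, world!");
// Each token is paired with its (optional) character offsets in the input.
for (token, offset) in output.tokens.iter().zip(output.offsets.iter()) {
    println!("{token}: {offset:?}");
}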
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>

Tokenize a list of strings, returning a vector of token vectors.
fn tokenize_list_with_offsets<S>(
    &self,
    text_list: &[S],
) -> Vec<TokensWithOffsets>
Tokenize a list of strings, where each corresponds to, for example, a sentence. Returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on each element of the list provided.

fn convert_tokens_to_ids<S>(&self, tokens: &[S]) -> Vec<i64>

Convert a slice of string-like values to a vector of token indices.
fn encode(
    &self,
    text_1: &str,
    text_2: Option<&str>,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize,
) -> TokenizedInput
Encode a string-like input (tokenization followed by encoding).
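A sketch of single-text encoding (pass Some(text) as the second argument to encode a pair; the token_ids field is assumed from TokenizedInput):
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::BaseVocab;

let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", true, true).unwrap();
// Encode a single text, truncating to at most 128 tokens.
let input = tokenizer.encode("Hello world", None, 128, &TruncationStrategy::LongestFirst, 0);
println!("{:?}", input.token_ids);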
fn encode_list<S>(
    &self,
    text_list: &[S],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize,
) -> Vec<TokenizedInput>
Encode a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with encode's optional second text, each text provided here is encoded independently.
fn encode_pair_list<S>(
    &self,
    text_list: &[(S, S)],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize,
) -> Vec<TokenizedInput>
Encode a sequence of string-like text pairs (tokenization followed by encoding). This combines encode with the list processing of encode_list.
fn decode_to_vec(
    &self,
    token_ids: &[i64],
    skip_special_tokens: bool,
) -> Vec<String>
Decode a sequence of token indices to a sequence of Strings, optionally skipping special indices.
fn decode(
    &self,
    token_ids: &[i64],
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool,
) -> String
Converts a sequence of ids (integers) into a string, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces.
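Continuing the encode sketch above, decoding maps the ids back to text:
// Skip special tokens and clean up tokenization spaces while decoding.
let text = tokenizer.decode(&input.token_ids, true, true);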
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
Converts a sequence of strings into a single string. This will clean up artifacts from tokenization (for example sub ##word) and generate a single output string.

fn clean_up_tokenization(&self, input_string: String) -> String
Cleans up tokenization artifacts (for example whitespace before punctuation).
fn decode_list(
    &self,
    token_ids_list: &[Vec<i64>],
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool,
) -> Vec<String>
Converts a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids.
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>,
) -> TokenIdsWithSpecialTokens
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and adding special tokens.
fn add_tokens(&mut self, tokens: &[&str])

Add arbitrary tokens to the vocabulary.
fn add_extra_ids(&mut self, num_extra_ids: i64)

Add a given number of extra ids to the vocabulary.
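A short sketch of vocabulary extension (the token string is illustrative):
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;

let mut tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", true, true).unwrap();
// Register a custom token so it is treated as a known vocabulary entry.
tokenizer.add_tokens(&["[NEW_TOKEN]"]);
// Append a batch of extra ids to the vocabulary.
tokenizer.add_extra_ids(4);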
Auto Trait Implementations§
impl<T> Freeze for BaseTokenizer<T> where T: Freeze
impl<T> RefUnwindSafe for BaseTokenizer<T> where T: RefUnwindSafe
impl<T> Send for BaseTokenizer<T> where T: Send
impl<T> Sync for BaseTokenizer<T> where T: Sync
impl<T> Unpin for BaseTokenizer<T> where T: Unpin
impl<T> UnwindSafe for BaseTokenizer<T> where T: UnwindSafe
Blanket Implementations§
impl<T> BorrowMut<T> for T where T: ?Sized

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value.
impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise.

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise.