Struct rust_tokenizers::tokenizer::Gpt2Tokenizer
pub struct Gpt2Tokenizer { /* private fields */ }
GPT2 tokenizer
GPT2 tokenizer performing:
- splitting on special characters
- whitespace splitting
- (optional) lower casing
- BPE tokenization
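The BPE step above can be sketched in plain Rust. The following is a hypothetical, simplified greedy merge loop for illustration (the pair ranks in `main` are made up); it is not the crate's implementation:

```rust
use std::collections::HashMap;

/// Greedy BPE sketch: repeatedly merge the adjacent symbol pair with the
/// lowest merge rank until no mergeable pair remains.
fn bpe(word: &str, ranks: &HashMap<(String, String), usize>) -> Vec<String> {
    let mut symbols: Vec<String> = word.chars().map(|c| c.to_string()).collect();
    loop {
        // Find the adjacent pair with the lowest (earliest-learned) rank.
        let best = symbols
            .windows(2)
            .enumerate()
            .filter_map(|(i, w)| ranks.get(&(w[0].clone(), w[1].clone())).map(|r| (*r, i)))
            .min();
        match best {
            Some((_, i)) => {
                // Replace the pair at position i with its merged symbol.
                let merged = format!("{}{}", symbols[i], symbols[i + 1]);
                symbols.splice(i..i + 2, std::iter::once(merged));
            }
            None => break,
        }
    }
    symbols
}

fn main() {
    let mut ranks = HashMap::new();
    ranks.insert(("l".to_string(), "o".to_string()), 0);
    ranks.insert(("lo".to_string(), "w".to_string()), 1);
    assert_eq!(bpe("low", &ranks), vec!["low"]);
    assert_eq!(bpe("owl", &ranks), vec!["o", "w", "l"]);
}
```

In the real tokenizer the ranks come from the merges file passed to `from_file`, and merging operates on byte-level symbols rather than `char`s.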
Implementations
impl Gpt2Tokenizer
pub fn from_file(
    vocab_path: &str,
    merges_path: &str,
    lower_case: bool
) -> Result<Gpt2Tokenizer, TokenizerError>
Creates a new instance of a Gpt2Tokenizer. Expects a JSON vocabulary file and a merges file as inputs.
Parameters
- vocab_path (&str): path to the vocabulary file
- merges_path (&str): path to the merges file (used as part of the BPE encoding process)
- lower_case (bool): flag indicating whether the text should be lower-cased as part of tokenization
Example
use rust_tokenizers::tokenizer::{Gpt2Tokenizer, Tokenizer};
let lower_case = false;
let tokenizer =
Gpt2Tokenizer::from_file("path/to/vocab/file", "path/to/merges/file", lower_case).unwrap();
pub fn from_existing_vocab_and_merges(
    vocab: Gpt2Vocab,
    merges: BpePairVocab,
    lower_case: bool
) -> Gpt2Tokenizer
Creates a new instance of a Gpt2Tokenizer from an existing vocabulary and merges.
Parameters
- vocab (Gpt2Vocab): GPT-like vocabulary
- merges (BpePairVocab): BPE pairs vocabulary
- lower_case (bool): flag indicating whether the text should be lower-cased as part of tokenization
Example
use rust_tokenizers::tokenizer::{Gpt2Tokenizer, Tokenizer};
use rust_tokenizers::vocab::{BpePairVocab, Gpt2Vocab, Vocab};
let lower_case = false;
let vocab = Gpt2Vocab::from_file("path/to/vocab/file").unwrap();
let merges = BpePairVocab::from_file("path/to/merges/file").unwrap();
let tokenizer = Gpt2Tokenizer::from_existing_vocab_and_merges(vocab, merges, lower_case);
Trait Implementations
impl MultiThreadedTokenizer<Gpt2Vocab> for Gpt2Tokenizer
fn tokenize_list_with_offsets<S>(
    &self,
    text_list: &[S]
) -> Vec<TokensWithOffsets>
where
    S: AsRef<str> + Sync,

Tokenizes a list of strings in parallel, where each string corresponds to, for example, a sentence. Returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on each item of the list.
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
where
    S: AsRef<str> + Sync,

Multithreaded tokenization of a list of strings, returning a vector of tokens for each string.
fn encode_list<S>(
    &self,
    text_list: &[S],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput>
where
    S: AsRef<str> + Sync,

Multithreaded encoding of a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with encode and its optional second text, each text provided is encoded independently.
fn encode_pair_list<S>(
    &self,
    text_list: &[(S, S)],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput>
where
    S: AsRef<str> + Sync,

Multithreaded encoding of a sequence of string-like text pairs (tokenization followed by encoding). This combines the pair handling of encode with the list processing of encode_list.
fn decode_list(
    &self,
    token_ids_list: &[Vec<i64>],
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>

Multithreaded conversion of a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids.
impl Tokenizer<Gpt2Vocab> for Gpt2Tokenizer
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>

Tokenizes a TokenRef, returning a sequence of tokens.
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String

Converts a sequence of token strings into a single string. This cleans up artifacts from tokenization (for example sub-word markers, as in sub ##word) and generates a single output string.
fn tokenize(&self, text: &str) -> Vec<String>

Tokenizes a string, returning a vector of tokens as strings. Use tokenize_with_offsets or tokenize_to_tokens to return offset information.
fn tokenize_with_offsets(&self, text: &str) -> TokensWithOffsets

Tokenizes a string, returning tokens with offset information.
fn tokenize_list<S>(&self, text_list: &[S]) -> Vec<Vec<String>>
where
    S: AsRef<str>,

Tokenizes a list of strings, returning a vector of tokens for each string.
fn tokenize_list_with_offsets<S>(
    &self,
    text_list: &[S]
) -> Vec<TokensWithOffsets>
where
    S: AsRef<str>,

Tokenizes a list of strings, where each corresponds to, for example, a sentence. Returns a vector of TokensWithOffsets containing the tokens and their offset information. This calls tokenize_with_offsets on each item of the list.
fn convert_tokens_to_ids<S>(&self, tokens: &[S]) -> Vec<i64>
where
    S: AsRef<str>,

Converts a slice of string-like tokens to a vector of token indices.
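The lookup behind this method can be sketched with a plain HashMap, falling back to a hypothetical unknown-token id for out-of-vocabulary tokens. This is a simplified illustration, not the crate's code (in the real tokenizer the mapping lives in Gpt2Vocab):

```rust
use std::collections::HashMap;

/// Map each token string to its vocabulary index, falling back to `unk_id`
/// for tokens missing from the vocabulary.
fn tokens_to_ids<S: AsRef<str>>(
    tokens: &[S],
    vocab: &HashMap<String, i64>,
    unk_id: i64,
) -> Vec<i64> {
    tokens
        .iter()
        .map(|t| *vocab.get(t.as_ref()).unwrap_or(&unk_id))
        .collect()
}

fn main() {
    let vocab: HashMap<String, i64> =
        [("hello".to_string(), 0), ("world".to_string(), 1)].into_iter().collect();
    // "oov" is not in the toy vocabulary, so it maps to the fallback id.
    assert_eq!(tokens_to_ids(&["hello", "oov", "world"], &vocab, -1), vec![0, -1, 1]);
}
```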
fn encode(
    &self,
    text_1: &str,
    text_2: Option<&str>,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> TokenizedInput

Encodes a string-like input (tokenization followed by encoding).
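The interplay of max_len and stride can be sketched as follows. This is a hypothetical, simplified truncation that keeps the first max_len ids and returns an overflow window sharing stride ids of context with the kept part; the actual TruncationStrategy handling in the crate is richer than this:

```rust
/// Keep the first `max_len` ids; the overflowing remainder retains `stride`
/// ids of overlap with the kept window so the next chunk has some context.
fn truncate_with_stride(ids: &[i64], max_len: usize, stride: usize) -> (Vec<i64>, Vec<i64>) {
    if ids.len() <= max_len {
        // Nothing to truncate: the whole sequence fits.
        return (ids.to_vec(), Vec::new());
    }
    let kept = ids[..max_len].to_vec();
    // Start the overflow `stride` positions before the cut point.
    let overflow = ids[max_len.saturating_sub(stride)..].to_vec();
    (kept, overflow)
}

fn main() {
    let (kept, overflow) = truncate_with_stride(&[10, 11, 12, 13, 14], 3, 1);
    assert_eq!(kept, vec![10, 11, 12]);
    // The overflow repeats id 12, the one token of stride overlap.
    assert_eq!(overflow, vec![12, 13, 14]);
}
```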
fn encode_list<S>(
    &self,
    text_list: &[S],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput>
where
    S: AsRef<str>,

Encodes a sequence of string-like texts (tokenization followed by encoding). Note that, in contrast with encode and its optional second text, each text provided is encoded independently.
fn encode_pair_list<S>(
    &self,
    text_list: &[(S, S)],
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput>
where
    S: AsRef<str>,

Encodes a sequence of string-like text pairs (tokenization followed by encoding). This combines the pair handling of encode with the list processing of encode_list.
fn decode_to_vec(
    &self,
    token_ids: &[i64],
    skip_special_tokens: bool
) -> Vec<String>

Decodes a sequence of token indices to a sequence of Strings, optionally skipping special indices.
fn decode(
    &self,
    token_ids: &[i64],
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> String

Converts a sequence of ids (integers) into a string, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces.
fn clean_up_tokenization(&self, input_string: String) -> String

Cleans up tokenization artifacts (for example whitespace before punctuation).
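The kind of clean-up involved can be sketched as simple string rewriting. This is a guessed, minimal version that removes the space naive detokenization leaves before common punctuation; the crate's exact rules may differ:

```rust
/// Remove the space that naive detokenization leaves before punctuation.
fn clean_up(text: &str) -> String {
    text.replace(" .", ".")
        .replace(" ,", ",")
        .replace(" !", "!")
        .replace(" ?", "?")
        .replace(" '", "'")
}

fn main() {
    assert_eq!(clean_up("Hello , world !"), "Hello, world!");
}
```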
fn decode_list(
    &self,
    token_ids_list: &[Vec<i64>],
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>

Converts a list of sequences of ids (integers) into strings, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. This calls decode for each provided sequence of ids.
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens

Builds model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and adding special tokens.
Auto Trait Implementations
impl RefUnwindSafe for Gpt2Tokenizer
impl Send for Gpt2Tokenizer
impl Sync for Gpt2Tokenizer
impl Unpin for Gpt2Tokenizer
impl UnwindSafe for Gpt2Tokenizer
Blanket Implementations
impl<T> BorrowMut<T> for T
where
    T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value.