Struct Tokenizer

Source

pub struct Tokenizer {
    pub bpe: BytePairEncoding,
    pub pre: Option<Pretokenizer>,
}

Expand description

A byte-pair encoding tokenizer that supports a pre-tokenization regex. The direct methods on this type pre-tokenize the input text and should produce the same output as the tiktoken tokenizers. The type gives access to the regex and underlying byte-pair encoding if needed. Note that using the byte-pair encoding directly does not take the regex into account and may result in output that differs from tiktoken.

Fields§

§bpe: BytePairEncoding

The byte-pair encoding for this tokenizer.

§pre: Option<Pretokenizer>

The pattern regex used to split the input.

Struct Tokenizer

Fields§

Implementations§

impl Tokenizer

pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError>

pub fn new_lookahead( bpe: BytePairEncoding, patterns: &[(&str, bool)], ) -> Result<Self, BuildError>

pub fn count(&self, text: &str) -> usize

pub fn count_till_limit(&self, text: &str, token_limit: usize) -> Option<usize>

pub fn encode(&self, text: &str) -> Vec<u32>

pub fn decode(&self, tokens: &[u32]) -> Option<String>

pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a

Auto Trait Implementations§

impl Freeze for Tokenizer

impl RefUnwindSafe for Tokenizer

impl Send for Tokenizer

impl Sync for Tokenizer

impl Unpin for Tokenizer

impl UnwindSafe for Tokenizer

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Struct Tokenizer

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,