Skip to main content

Tokenizer

Struct Tokenizer 

Source
pub struct Tokenizer { /* private fields */ }
Expand description

High-level tokenizer combining pre-tokenization, encoding, and decoding.

§Example

use tokie::Tokenizer;

let tokenizer = Tokenizer::from_json("tokenizer.json")?;
let enc = tokenizer.encode("Hello, world!", false);
let text = tokenizer.decode(&enc.ids);

Implementations§

Source§

impl Tokenizer

Source

pub fn to_file(&self, path: impl AsRef<Path>) -> Result<(), SerdeError>

Save the tokenizer to a file.

This saves the pre-built DAAC state, enabling fast loading without rebuilding the automaton.

Source

pub fn save<W: Write>(&self, writer: &mut W) -> Result<(), SerdeError>

Save the tokenizer to a writer.

Source

pub fn from_file(path: impl AsRef<Path>) -> Result<Self, SerdeError>

Load a tokenizer from a file.

This loads pre-built DAAC state for instant use without rebuilding.

Source

pub fn load<R: Read>(reader: &mut R) -> Result<Self, SerdeError>

Load a tokenizer from a reader.

Source§

impl Tokenizer

Source

pub fn new( encoder: Encoder, decoder: Decoder, pretokenizer_type: PretokType, normalizer: Normalizer, post_processor: PostProcessor, ) -> Self

Source

pub fn set_added_tokens(&mut self, tokens: &[(TokenId, Vec<u8>)])

Set added tokens matcher for non-special added tokens. These are matched BEFORE pretokenization, like HuggingFace does.

Source

pub fn pretokenizer_type(&self) -> PretokType

Source

pub fn normalizer(&self) -> Normalizer

Source

pub fn post_processor(&self) -> &PostProcessor

Source

pub fn encoder_type(&self) -> EncoderType

Source

pub fn decoder_type(&self) -> DecoderType

Source

pub fn encoder(&self) -> &Encoder

Source

pub fn decoder(&self) -> &Decoder

Source

pub fn pretokenizer(&self) -> Option<&Pretok>

Source

pub fn vocab_size(&self) -> usize

Source

pub fn pad_token_id(&self) -> Option<TokenId>

Source

pub fn padding(&self) -> Option<&PaddingParams>

Source

pub fn truncation(&self) -> Option<&TruncationParams>

Source

pub fn num_special_tokens_to_add(&self, is_pair: bool) -> usize

Number of special tokens that will be added when encoding: pass `is_pair = false` for a single sequence, `is_pair = true` for a sequence pair.

Source

pub fn from_json(path: impl AsRef<Path>) -> Result<Self, JsonLoadError>

Load from a HuggingFace tokenizer.json file.

Source

pub fn from_json_with_encoder( path: impl AsRef<Path>, encoder_type: EncoderType, ) -> Result<Self, JsonLoadError>

Load from a HuggingFace tokenizer.json with a specific encoder type.

Source

pub fn enable_padding(&mut self, params: PaddingParams) -> &mut Self

Source

pub fn enable_truncation(&mut self, params: TruncationParams) -> &mut Self

Source

pub fn no_padding(&mut self) -> &mut Self

Source

pub fn no_truncation(&mut self) -> &mut Self

Source

pub fn set_pad_token_id(&mut self, id: TokenId) -> &mut Self

Source

pub fn id_to_token(&self, id: TokenId) -> Option<Cow<'_, str>>

Get the token string for a given token ID. Returns lossy UTF-8 for byte-level tokens that aren’t valid UTF-8.

Source

pub fn token_to_id(&self, token: &str) -> Option<TokenId>

Look up a token string and return its token ID (O(1) after first call).

Source

pub fn get_vocab(&self) -> HashMap<String, TokenId>

Get the full vocabulary as a map from token strings to token IDs.

Source

pub fn token_to_bytes(&self, token: TokenId) -> &[u8]

Get the byte sequence for a token.

Source

pub fn encode(&self, text: &str, add_special_tokens: bool) -> Encoding

Encode text into an Encoding with token IDs, attention mask, and type IDs.

§Example
let enc = tokenizer.encode("Hello, world!", true);
println!("{:?}", enc.ids);
Source

pub fn encode_with_offsets( &self, text: &str, add_special_tokens: bool, ) -> Encoding

Encode text with byte offsets for each token.

Returns an Encoding with offsets populated — each entry is a (start, end) byte range in the (normalized) input text corresponding to that token.

Special tokens (CLS, SEP, BOS) get offset (0, 0).

§Example
let enc = tokenizer.encode_with_offsets("Hello, world!", true);
for (id, (start, end)) in enc.ids.iter().zip(&enc.offsets) {
    println!("token {} -> bytes {}..{}", id, start, end);
}
Source

pub fn encode_pair( &self, text_a: &str, text_b: &str, add_special_tokens: bool, ) -> Encoding

Encode a pair of texts (e.g. for cross-encoder models).

§Example
let enc = tokenizer.encode_pair("What is Berlin?", "Berlin is the capital.", true);
Source

pub fn encode_bytes(&self, bytes: &[u8]) -> Vec<TokenId>

Encode raw bytes directly (bypasses pretokenizer and normalizer).

Source

pub fn encode_iter<'a>(&'a self, text: &'a str) -> TokenizeIter<'a>

Streaming iterator over encoded tokens.

Source

pub fn encode_bytes_iter<'a>(&'a self, bytes: &'a [u8]) -> EncoderIter<'a>

Streaming iterator over encoded tokens from bytes (bypasses pretokenizer).

Source

pub fn decode(&self, tokens: &[TokenId]) -> Option<String>

Decode token IDs back to a string, applying text-level post-processing.

Behavior depends on the DecoderType:

  • WordPiece: Strips ## continuation prefixes, joins tokens with spaces, and skips special tokens (CLS, SEP, etc.)
  • Metaspace (SentencePiece/Unigram): Replaces the ▁ (U+2581) metaspace marker with spaces, strips leading space
  • ByteLevel (BPE): Direct byte concatenation (already correct)

Returns None if the result is not valid UTF-8.

Source

pub fn decode_bytes(&self, tokens: &[TokenId]) -> Vec<u8>

Raw byte-level decode without text post-processing.

Source

pub fn decode_batch(&self, sequences: &[&[TokenId]]) -> Vec<Option<String>>

Decode multiple token sequences in parallel.

Source

pub fn encode_batch( &self, texts: &[&str], add_special_tokens: bool, ) -> Vec<Encoding>

Encode multiple texts in parallel, with optional padding.

§Example
let encodings = tokenizer.encode_batch(&["Hello!", "World"], true);
Source

pub fn count_tokens_batch(&self, texts: &[&str]) -> Vec<usize>

Count tokens for multiple texts in parallel.

Source

pub fn count_tokens(&self, text: &str) -> usize

Count tokens without storing them (no special tokens).

Source

pub fn token_count<'a>(&'a self, text: &'a str) -> TokenCount<'a>

Lazy token count with early termination for comparisons.

§Example
if tokenizer.token_count(text) > 8192 {
    println!("text exceeds context window");
}

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<T> ErasedDestructor for T
where T: 'static,

Source§

impl<T> MaybeSendSync for T