Skip to main content

Tokenizer

Struct Tokenizer 

Source
pub struct Tokenizer { /* private fields */ }
Expand description

High-level tokenizer combining pre-tokenization, encoding, and decoding.

§Example

use tokie::Tokenizer;

let tokenizer = Tokenizer::from_json("tokenizer.json")?;
let enc = tokenizer.encode("Hello, world!", false);
let text = tokenizer.decode(&enc.ids);

Implementations§

Source§

impl Tokenizer

Source

pub fn to_file(&self, path: impl AsRef<Path>) -> Result<(), SerdeError>

Save the tokenizer to a file.

This saves the pre-built DAAC state, enabling fast loading without rebuilding the automaton.

Source

pub fn save<W: Write>(&self, writer: &mut W) -> Result<(), SerdeError>

Save the tokenizer to a writer.

Source

pub fn from_file(path: impl AsRef<Path>) -> Result<Self, SerdeError>

Load a tokenizer from a file.

This loads pre-built DAAC state for instant use without rebuilding.

Source

pub fn load<R: Read>(reader: &mut R) -> Result<Self, SerdeError>

Load a tokenizer from a reader.

Source§

impl Tokenizer

Source

pub fn new( encoder: Encoder, decoder: Decoder, pretokenizer_type: PretokType, normalizer: Normalizer, post_processor: PostProcessor, ) -> Self

Source

pub fn set_added_tokens(&mut self, tokens: &[(TokenId, Vec<u8>)])

Set added tokens matcher for non-special added tokens. These are matched BEFORE pretokenization, like HuggingFace does.

Source

pub fn pretokenizer_type(&self) -> PretokType

Source

pub fn normalizer(&self) -> Normalizer

Source

pub fn post_processor(&self) -> &PostProcessor

Source

pub fn encoder_type(&self) -> EncoderType

Source

pub fn decoder_type(&self) -> DecoderType

Source

pub fn encoder(&self) -> &Encoder

Source

pub fn decoder(&self) -> &Decoder

Source

pub fn pretokenizer(&self) -> Option<&Pretok>

Source

pub fn vocab_size(&self) -> usize

Source

pub fn pad_token_id(&self) -> Option<TokenId>

Source

pub fn padding(&self) -> Option<&PaddingParams>

Source

pub fn truncation(&self) -> Option<&TruncationParams>

Source

pub fn num_special_tokens_to_add(&self, is_pair: bool) -> usize

Number of special tokens that will be added when encoding: pass `is_pair = false` for a single sequence, `is_pair = true` for a sequence pair.

Source

pub fn from_json(path: impl AsRef<Path>) -> Result<Self, JsonLoadError>

Load from a HuggingFace tokenizer.json file.

Source

pub fn from_json_with_encoder( path: impl AsRef<Path>, encoder_type: EncoderType, ) -> Result<Self, JsonLoadError>

Load from a HuggingFace tokenizer.json with a specific encoder type.

Source

pub fn enable_padding(&mut self, params: PaddingParams) -> &mut Self

Source

pub fn enable_truncation(&mut self, params: TruncationParams) -> &mut Self

Source

pub fn no_padding(&mut self) -> &mut Self

Source

pub fn no_truncation(&mut self) -> &mut Self

Source

pub fn set_pad_token_id(&mut self, id: TokenId) -> &mut Self

Source

pub fn id_to_token(&self, id: TokenId) -> Option<Cow<'_, str>>

Get the token string for a given token ID. Returns lossy UTF-8 for byte-level tokens that aren’t valid UTF-8.

Source

pub fn token_to_id(&self, token: &str) -> Option<TokenId>

Look up a token string and return its token ID (O(1) after first call).

Source

pub fn get_vocab(&self) -> HashMap<String, TokenId>

Get the full vocabulary as a map from token strings to token IDs.

Source

pub fn token_to_bytes(&self, token: TokenId) -> &[u8]

Get the byte sequence for a token.

Source

pub fn encode(&self, text: &str, add_special_tokens: bool) -> Encoding

Encode text into an Encoding with token IDs, attention mask, and type IDs.

§Example
let enc = tokenizer.encode("Hello, world!", true);
println!("{:?}", enc.ids);
Source

pub fn encode_with_offsets( &self, text: &str, add_special_tokens: bool, ) -> Encoding

Encode text with byte offsets for each token.

Returns an Encoding with offsets populated — each entry is a (start, end) byte range in the (normalized) input text corresponding to that token.

Special tokens (CLS, SEP, BOS) get offset (0, 0).

§Example
let enc = tokenizer.encode_with_offsets("Hello, world!", true);
for (id, (start, end)) in enc.ids.iter().zip(&enc.offsets) {
    println!("token {} -> bytes {}..{}", id, start, end);
}
Source

pub fn encode_pair( &self, text_a: &str, text_b: &str, add_special_tokens: bool, ) -> Encoding

Encode a pair of texts (e.g. for cross-encoder models).

§Example
let enc = tokenizer.encode_pair("What is Berlin?", "Berlin is the capital.", true);
Source

pub fn encode_bytes(&self, bytes: &[u8]) -> Vec<TokenId>

Encode raw bytes directly (bypasses pretokenizer and normalizer).

Source

pub fn encode_iter<'a>(&'a self, text: &'a str) -> TokenizeIter<'a>

Streaming iterator over encoded tokens.

Source

pub fn encode_bytes_iter<'a>(&'a self, bytes: &'a [u8]) -> EncoderIter<'a>

Streaming iterator over encoded tokens from bytes (bypasses pretokenizer).

Source

pub fn decode(&self, tokens: &[TokenId]) -> Option<String>

Decode token IDs back to a string, applying text-level post-processing.

Behavior depends on the DecoderType:

  • WordPiece: Strips ## continuation prefixes, joins tokens with spaces, and skips special tokens (CLS, SEP, etc.)
  • Metaspace (SentencePiece/Unigram): Replaces the ▁ (U+2581) metaspace marker with spaces, strips leading space
  • ByteLevel (BPE): Direct byte concatenation (already correct)

Returns None if the result is not valid UTF-8.

Source

pub fn decode_bytes(&self, tokens: &[TokenId]) -> Vec<u8>

Raw byte-level decode without text post-processing.

Source

pub fn decode_batch(&self, sequences: &[&[TokenId]]) -> Vec<Option<String>>

Decode multiple token sequences in parallel.

Source

pub fn encode_batch( &self, texts: &[&str], add_special_tokens: bool, ) -> Vec<Encoding>

Encode multiple texts in parallel, with optional padding.

§Example
let encodings = tokenizer.encode_batch(&["Hello!", "World"], true);
Source

pub fn count_tokens_batch(&self, texts: &[&str]) -> Vec<usize>

Count tokens for multiple texts in parallel.

Source

pub fn count_tokens(&self, text: &str) -> usize

Count tokens without storing them (no special tokens).

Source

pub fn token_count<'a>(&'a self, text: &'a str) -> TokenCount<'a>

Lazy token count with early termination for comparisons.

§Example
if tokenizer.token_count(text) > 8192 {
    println!("text exceeds context window");
}

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<T> ErasedDestructor for T
where T: 'static,

Source§

impl<T> MaybeSendSync for T