pub struct Tokenizer { /* private fields */ }
Expand description
Implementations§
Source§impl Tokenizer
impl Tokenizer
Sourcepub fn to_file(&self, path: impl AsRef<Path>) -> Result<(), SerdeError>
pub fn to_file(&self, path: impl AsRef<Path>) -> Result<(), SerdeError>
Save the tokenizer to a file.
This saves the pre-built DAAC state, enabling fast loading without rebuilding the automaton.
Sourcepub fn save<W: Write>(&self, writer: &mut W) -> Result<(), SerdeError>
pub fn save<W: Write>(&self, writer: &mut W) -> Result<(), SerdeError>
Save the tokenizer to a writer.
Source§impl Tokenizer
impl Tokenizer
pub fn new( encoder: Encoder, decoder: Decoder, pretokenizer_type: PretokType, normalizer: Normalizer, post_processor: PostProcessor, ) -> Self
Sourcepub fn set_added_tokens(&mut self, tokens: &[(TokenId, Vec<u8>)])
pub fn set_added_tokens(&mut self, tokens: &[(TokenId, Vec<u8>)])
Set added tokens matcher for non-special added tokens. These are matched BEFORE pretokenization, like HuggingFace does.
pub fn pretokenizer_type(&self) -> PretokType
pub fn normalizer(&self) -> Normalizer
pub fn post_processor(&self) -> &PostProcessor
pub fn encoder_type(&self) -> EncoderType
pub fn decoder_type(&self) -> DecoderType
pub fn encoder(&self) -> &Encoder
pub fn decoder(&self) -> &Decoder
pub fn pretokenizer(&self) -> Option<&Pretok>
pub fn vocab_size(&self) -> usize
pub fn pad_token_id(&self) -> Option<TokenId>
pub fn padding(&self) -> Option<&PaddingParams>
pub fn truncation(&self) -> Option<&TruncationParams>
Sourcepub fn num_special_tokens_to_add(&self, is_pair: bool) -> usize
pub fn num_special_tokens_to_add(&self, is_pair: bool) -> usize
Number of special tokens added for a single sequence, or for a sequence pair when is_pair is true.
Sourcepub fn from_json(path: impl AsRef<Path>) -> Result<Self, JsonLoadError>
pub fn from_json(path: impl AsRef<Path>) -> Result<Self, JsonLoadError>
Load from a HuggingFace tokenizer.json file.
Sourcepub fn from_json_with_encoder(
path: impl AsRef<Path>,
encoder_type: EncoderType,
) -> Result<Self, JsonLoadError>
pub fn from_json_with_encoder( path: impl AsRef<Path>, encoder_type: EncoderType, ) -> Result<Self, JsonLoadError>
Load from a HuggingFace tokenizer.json with a specific encoder type.
pub fn enable_padding(&mut self, params: PaddingParams) -> &mut Self
pub fn enable_truncation(&mut self, params: TruncationParams) -> &mut Self
pub fn no_padding(&mut self) -> &mut Self
pub fn no_truncation(&mut self) -> &mut Self
pub fn set_pad_token_id(&mut self, id: TokenId) -> &mut Self
Sourcepub fn id_to_token(&self, id: TokenId) -> Option<Cow<'_, str>>
pub fn id_to_token(&self, id: TokenId) -> Option<Cow<'_, str>>
Get the token string for a given token ID. Returns lossy UTF-8 for byte-level tokens that aren’t valid UTF-8.
Sourcepub fn token_to_id(&self, token: &str) -> Option<TokenId>
pub fn token_to_id(&self, token: &str) -> Option<TokenId>
Look up a token string and return its token ID (O(1) after first call).
Sourcepub fn get_vocab(&self) -> HashMap<String, TokenId>
pub fn get_vocab(&self) -> HashMap<String, TokenId>
Get the full vocabulary as a map from token strings to token IDs.
Sourcepub fn token_to_bytes(&self, token: TokenId) -> &[u8] ⓘ
pub fn token_to_bytes(&self, token: TokenId) -> &[u8] ⓘ
Get the byte sequence for a token.
Sourcepub fn encode_with_offsets(
&self,
text: &str,
add_special_tokens: bool,
) -> Encoding
pub fn encode_with_offsets( &self, text: &str, add_special_tokens: bool, ) -> Encoding
Encode text with byte offsets for each token.
Returns an Encoding with offsets populated — each entry is a (start, end)
byte range in the (normalized) input text corresponding to that token.
Special tokens (CLS, SEP, BOS) get offset (0, 0).
§Example
let enc = tokenizer.encode_with_offsets("Hello, world!", true);
for (id, (start, end)) in enc.ids.iter().zip(&enc.offsets) {
println!("token {} -> bytes {}..{}", id, start, end);
}
Sourcepub fn encode_pair(
&self,
text_a: &str,
text_b: &str,
add_special_tokens: bool,
) -> Encoding
pub fn encode_pair( &self, text_a: &str, text_b: &str, add_special_tokens: bool, ) -> Encoding
Sourcepub fn encode_bytes(&self, bytes: &[u8]) -> Vec<TokenId> ⓘ
pub fn encode_bytes(&self, bytes: &[u8]) -> Vec<TokenId> ⓘ
Encode raw bytes directly (bypasses pretokenizer and normalizer).
Sourcepub fn encode_iter<'a>(&'a self, text: &'a str) -> TokenizeIter<'a> ⓘ
pub fn encode_iter<'a>(&'a self, text: &'a str) -> TokenizeIter<'a> ⓘ
Streaming iterator over encoded tokens.
Sourcepub fn encode_bytes_iter<'a>(&'a self, bytes: &'a [u8]) -> EncoderIter<'a> ⓘ
pub fn encode_bytes_iter<'a>(&'a self, bytes: &'a [u8]) -> EncoderIter<'a> ⓘ
Streaming iterator over encoded tokens from bytes (bypasses pretokenizer).
Sourcepub fn decode(&self, tokens: &[TokenId]) -> Option<String>
pub fn decode(&self, tokens: &[TokenId]) -> Option<String>
Decode token IDs back to a string, applying text-level post-processing.
Behavior depends on the DecoderType:
- WordPiece: Strips ## continuation prefixes, joins tokens with spaces, and skips special tokens (CLS, SEP, etc.)
- Metaspace (SentencePiece/Unigram): Replaces ▁ with spaces, strips leading space
- ByteLevel (BPE): Direct byte concatenation (already correct)
Returns None if the result is not valid UTF-8.
Sourcepub fn decode_bytes(&self, tokens: &[TokenId]) -> Vec<u8> ⓘ
pub fn decode_bytes(&self, tokens: &[TokenId]) -> Vec<u8> ⓘ
Raw byte-level decode without text post-processing.
Sourcepub fn decode_batch(&self, sequences: &[&[TokenId]]) -> Vec<Option<String>>
pub fn decode_batch(&self, sequences: &[&[TokenId]]) -> Vec<Option<String>>
Decode multiple token sequences in parallel.
Sourcepub fn count_tokens_batch(&self, texts: &[&str]) -> Vec<usize>
pub fn count_tokens_batch(&self, texts: &[&str]) -> Vec<usize>
Count tokens for multiple texts in parallel.
Sourcepub fn count_tokens(&self, text: &str) -> usize
pub fn count_tokens(&self, text: &str) -> usize
Count tokens without storing them (no special tokens).