pub struct Tokenizer { /* private fields */ }
Wrapper around the HuggingFace tokenizer.
Implementations§
Source§impl Tokenizer
impl Tokenizer
Sourcepub fn from_file(path: impl AsRef<Path>) -> Result<Self, TokenizerError>
pub fn from_file(path: impl AsRef<Path>) -> Result<Self, TokenizerError>
Load a tokenizer from a tokenizer.json file.
Sourcepub fn from_json(json: &str) -> Result<Self, TokenizerError>
pub fn from_json(json: &str) -> Result<Self, TokenizerError>
Load a tokenizer from a JSON string.
Sourcepub fn encode(&self, text: &str) -> Result<Vec<u32>, TokenizerError>
pub fn encode(&self, text: &str) -> Result<Vec<u32>, TokenizerError>
Encode text into token IDs.
Sourcepub fn encode_with_special(
&self,
text: &str,
) -> Result<Vec<u32>, TokenizerError>
pub fn encode_with_special( &self, text: &str, ) -> Result<Vec<u32>, TokenizerError>
Encode text with special tokens (e.g., BOS).
Sourcepub fn decode(&self, ids: &[u32]) -> Result<String, TokenizerError>
pub fn decode(&self, ids: &[u32]) -> Result<String, TokenizerError>
Decode token IDs back to text.
Sourcepub fn decode_one(&self, id: u32) -> Result<String, TokenizerError>
pub fn decode_one(&self, id: u32) -> Result<String, TokenizerError>
Decode a single token ID to text.
Sourcepub fn vocab_size(&self) -> usize
pub fn vocab_size(&self) -> usize
Get the vocabulary size.
Sourcepub fn token_to_id(&self, token: &str) -> Option<u32>
pub fn token_to_id(&self, token: &str) -> Option<u32>
Get the token ID for a special token by content (e.g., “<|endoftext|>”).
Sourcepub fn bos_token_id(&self) -> Option<u32>
pub fn bos_token_id(&self) -> Option<u32>
Get the BOS (beginning of sequence) token ID, if defined.
Sourcepub fn eos_token_id(&self) -> Option<u32>
pub fn eos_token_id(&self) -> Option<u32>
Get the EOS (end of sequence) token ID, if defined.
Sourcepub fn stop_token_ids(&self) -> Vec<u32>
pub fn stop_token_ids(&self) -> Vec<u32>
Get all stop token IDs (EOS + chat-specific stop tokens). Used to detect when generation should stop.
Sourcepub fn is_stop_token(&self, token_id: u32) -> bool
pub fn is_stop_token(&self, token_id: u32) -> bool
Check if a token ID is a stop token.
Auto Trait Implementations§
impl !Freeze for Tokenizer
impl RefUnwindSafe for Tokenizer
impl Send for Tokenizer
impl Sync for Tokenizer
impl Unpin for Tokenizer
impl UnsafeUnpin for Tokenizer
impl UnwindSafe for Tokenizer
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more