pub struct TokenizerWrapper { /* private fields */ }
Expand description
Thread-safe tokenizer wrapper
§Thread Safety
This wrapper uses Arc<Tokenizer> for thread-safe access.
Multiple threads can encode text concurrently using the same tokenizer.
§Performance
- Tokenization: ~0.1-0.5ms per input (100-500 tokens)
- Thread-safe without locks (immutable after creation)
- Batch encoding is more efficient than individual calls
§Example
let tokenizer = Arc::new(
TokenizerWrapper::from_pretrained(
"microsoft/deberta-v3-base",
TokenizerConfig::default(),
)?
);
let handles: Vec<_> = (0..4)
.map(|i| {
let tok = Arc::clone(&tokenizer);
thread::spawn(move || tok.encode(&format!("Text {}", i)))
})
.collect();
for handle in handles {
let encoding = handle.join().unwrap()?;
println!("Encoded {} tokens", encoding.len());
}
Implementations§
Source§impl TokenizerWrapper
impl TokenizerWrapper
Source pub fn from_pretrained(
model_name: &str,
config: TokenizerConfig,
) -> Result<Self>
pub fn from_pretrained( model_name: &str, config: TokenizerConfig, ) -> Result<Self>
Load a tokenizer from HuggingFace Hub
§Arguments
- model_name - HuggingFace model identifier (e.g., “microsoft/deberta-v3-base”)
- config - Tokenizer configuration
§Supported Models
- DeBERTa: microsoft/deberta-v3-base (PromptInjection)
- RoBERTa: roberta-base (Toxicity, Sentiment)
- BERT: bert-base-uncased
- Any HuggingFace model with a tokenizer
§Example
let tokenizer = TokenizerWrapper::from_pretrained(
"microsoft/deberta-v3-base",
TokenizerConfig::default(),
)?;
Source pub fn encode_batch(&self, texts: &[&str]) -> Result<Vec<Encoding>>
pub fn encode_batch(&self, texts: &[&str]) -> Result<Vec<Encoding>>
Encode multiple texts in batch
Batch encoding is more efficient than encoding texts individually.
§Arguments
- texts - Slice of text strings
§Returns
Vector of Encoding results (one per input text)
§Example
let texts = vec!["First text", "Second text", "Third text"];
let encodings = tokenizer.encode_batch(&texts)?;
assert_eq!(encodings.len(), 3);
for encoding in encodings {
println!("Length: {}", encoding.len());
}
Source pub fn config(&self) -> &TokenizerConfig
pub fn config(&self) -> &TokenizerConfig
Get the tokenizer configuration
Source pub fn vocab_size(&self) -> usize
pub fn vocab_size(&self) -> usize
Get the vocabulary size
Returns the size of the tokenizer’s vocabulary.
Trait Implementations§
Auto Trait Implementations§
impl Freeze for TokenizerWrapper
impl RefUnwindSafe for TokenizerWrapper
impl Send for TokenizerWrapper
impl Sync for TokenizerWrapper
impl Unpin for TokenizerWrapper
impl UnwindSafe for TokenizerWrapper
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> CloneToUninit for T where
T: Clone,
impl<T> CloneToUninit for T where
T: Clone,
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left` is `true`.
Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left(&self)` returns `true`.
Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise. Read more