pub struct HfBpeTokenizer { /* private fields */ }
Expand description
Byte Pair Encoding tokenizer.
Implements subword tokenization using BPE algorithm.
Implementations§
Source§impl BpeTokenizer
impl BpeTokenizer
Sourcepub fn new(config: BpeConfig) -> BpeTokenizer
pub fn new(config: BpeConfig) -> BpeTokenizer
Create a new BPE tokenizer with given config
Sourcepub fn from_huggingface<P>(path: P) -> Result<BpeTokenizer, AprenderError>
pub fn from_huggingface<P>(path: P) -> Result<BpeTokenizer, AprenderError>
Load tokenizer from a HuggingFace tokenizer.json file path.
Parses the HuggingFace tokenizer.json format, extracting:
- model.vocab (token-to-ID mapping)
- model.merges (ordered BPE merge rules)
- added_tokens (special tokens like <|endoftext|>, <|im_start|>)
The byte encoder for UTF-8 byte-level BPE is built automatically.
§Arguments
path - Path to a HuggingFace tokenizer.json file
§Returns
A fully loaded BpeTokenizer with vocabulary, merge rules, and special tokens.
§Errors
Returns error if the file cannot be read or the JSON is malformed.
§Example
use aprender::text::bpe::BpeTokenizer;
let tokenizer = BpeTokenizer::from_huggingface("path/to/tokenizer.json")
.expect("failed to load tokenizer");
assert!(tokenizer.vocab_size() > 0);
let ids = tokenizer.encode("Hello world");
assert!(!ids.is_empty());
Sourcepub fn from_huggingface_json(json: &str) -> Result<BpeTokenizer, AprenderError>
pub fn from_huggingface_json(json: &str) -> Result<BpeTokenizer, AprenderError>
Load tokenizer from a HuggingFace tokenizer.json string.
This is the in-memory counterpart of from_huggingface.
Useful when the JSON has already been read into a string (e.g., from an HTTP response).
§Arguments
json - JSON string in HuggingFace tokenizer.json format
§Returns
A fully loaded BpeTokenizer.
§Errors
Returns error if JSON parsing fails or the structure is invalid.
Sourcepub fn from_vocab_merges<P, Q>(
vocab_path: P,
merges_path: Q,
) -> Result<BpeTokenizer, AprenderError>
pub fn from_vocab_merges<P, Q>( vocab_path: P, merges_path: Q, ) -> Result<BpeTokenizer, AprenderError>
Load tokenizer from legacy GPT-2/RoBERTa format (vocab.json + merges.txt).
CodeBERT and other RoBERTa-family models use this format instead of the
unified tokenizer.json. The vocab.json maps tokens to IDs, and merges.txt
contains ordered BPE merge rules (one per line, #version header skipped).
§Arguments
- vocab_path - Path to vocab.json ({"token": id, ...})
- merges_path - Path to merges.txt (header + one pair1 pair2 per line)
§Returns
A fully loaded BpeTokenizer with vocabulary and merge rules.
§Errors
Returns error if files cannot be read or JSON is malformed.
§Example
use aprender::text::bpe::BpeTokenizer;
let tokenizer = BpeTokenizer::from_vocab_merges(
"path/to/vocab.json",
"path/to/merges.txt",
).expect("failed to load tokenizer");
assert!(tokenizer.vocab_size() > 0);
Sourcepub fn gpt2_base() -> BpeTokenizer
pub fn gpt2_base() -> BpeTokenizer
Create tokenizer with GPT-2 base vocabulary (stub)
§Note
Real implementation requires loading vocabulary files.
Sourcepub fn add_special_token(&mut self, token: &str, id: u32)
pub fn add_special_token(&mut self, token: &str, id: u32)
Add a special token
Sourcepub fn vocab_size(&self) -> usize
pub fn vocab_size(&self) -> usize
Get vocabulary size
Sourcepub fn token_to_id(&self, token: &str) -> Option<u32>
pub fn token_to_id(&self, token: &str) -> Option<u32>
Get token ID for a token
Sourcepub fn id_to_token(&self, id: u32) -> Option<&str>
pub fn id_to_token(&self, id: u32) -> Option<&str>
Get token for an ID
Sourcepub fn is_special_token(&self, token: &str) -> bool
pub fn is_special_token(&self, token: &str) -> bool
Check if token is a special token
Sourcepub fn encode_checked(&self, text: &str) -> Result<Vec<u32>, AprenderError>
pub fn encode_checked(&self, text: &str) -> Result<Vec<u32>, AprenderError>
Sourcepub fn decode_checked(&self, ids: &[u32]) -> Result<String, AprenderError>
pub fn decode_checked(&self, ids: &[u32]) -> Result<String, AprenderError>
Trait Implementations§
Source§impl Clone for BpeTokenizer
impl Clone for BpeTokenizer
Source§fn clone(&self) -> BpeTokenizer
fn clone(&self) -> BpeTokenizer
1.0.0 (const: unstable) · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl Debug for BpeTokenizer
impl Debug for BpeTokenizer
Source§impl Default for BpeTokenizer
impl Default for BpeTokenizer
Source§fn default() -> BpeTokenizer
fn default() -> BpeTokenizer
Auto Trait Implementations§
impl Freeze for BpeTokenizer
impl RefUnwindSafe for BpeTokenizer
impl Send for BpeTokenizer
impl Sync for BpeTokenizer
impl Unpin for BpeTokenizer
impl UnsafeUnpin for BpeTokenizer
impl UnwindSafe for BpeTokenizer
Blanket Implementations§
impl<T> Allocation for T
impl<T> Allocation for T
Source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for Twhere
T: Clone,
impl<T> CloneToUninit for Twhere
T: Clone,
Source§impl<T> FmtForward for T
impl<T> FmtForward for T
Source§fn fmt_binary(self) -> FmtBinary<Self>where
Self: Binary,
fn fmt_binary(self) -> FmtBinary<Self>where
Self: Binary,
Causes self to use its Binary implementation when Debug-formatted.
Source§fn fmt_display(self) -> FmtDisplay<Self> where
Self: Display,
fn fmt_display(self) -> FmtDisplay<Self>where
Self: Display,
Causes self to use its Display implementation when
Debug-formatted.
Source§fn fmt_lower_exp(self) -> FmtLowerExp<Self> where
Self: LowerExp,
fn fmt_lower_exp(self) -> FmtLowerExp<Self>where
Self: LowerExp,
Causes self to use its LowerExp implementation when
Debug-formatted.
Source§fn fmt_lower_hex(self) -> FmtLowerHex<Self> where
Self: LowerHex,
fn fmt_lower_hex(self) -> FmtLowerHex<Self>where
Self: LowerHex,
Causes self to use its LowerHex implementation when
Debug-formatted.
Source§fn fmt_octal(self) -> FmtOctal<Self> where
Self: Octal,
fn fmt_octal(self) -> FmtOctal<Self>where
Self: Octal,
Causes self to use its Octal implementation when Debug-formatted.
Source§fn fmt_pointer(self) -> FmtPointer<Self> where
Self: Pointer,
fn fmt_pointer(self) -> FmtPointer<Self>where
Self: Pointer,
Causes self to use its Pointer implementation when
Debug-formatted.
Source§fn fmt_upper_exp(self) -> FmtUpperExp<Self> where
Self: UpperExp,
fn fmt_upper_exp(self) -> FmtUpperExp<Self>where
Self: UpperExp,
Causes self to use its UpperExp implementation when
Debug-formatted.
Source§fn fmt_upper_hex(self) -> FmtUpperHex<Self> where
Self: UpperHex,
fn fmt_upper_hex(self) -> FmtUpperHex<Self>where
Self: UpperHex,
Causes self to use its UpperHex implementation when
Debug-formatted.
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§impl<T> Pipe for T where
T: ?Sized,
impl<T> Pipe for Twhere
T: ?Sized,
Source§fn pipe<R>(self, func: impl FnOnce(Self) -> R) -> Rwhere
Self: Sized,
fn pipe<R>(self, func: impl FnOnce(Self) -> R) -> Rwhere
Self: Sized,
Source§fn pipe_ref<'a, R>(&'a self, func: impl FnOnce(&'a Self) -> R) -> Rwhere
R: 'a,
fn pipe_ref<'a, R>(&'a self, func: impl FnOnce(&'a Self) -> R) -> Rwhere
R: 'a,
Borrows self and passes that borrow into the pipe function. Read more
Source§fn pipe_ref_mut<'a, R>(&'a mut self, func: impl FnOnce(&'a mut Self) -> R) -> R where
R: 'a,
fn pipe_ref_mut<'a, R>(&'a mut self, func: impl FnOnce(&'a mut Self) -> R) -> Rwhere
R: 'a,
Mutably borrows self and passes that borrow into the pipe function. Read more
Source§fn pipe_borrow<'a, B, R>(&'a self, func: impl FnOnce(&'a B) -> R) -> R
fn pipe_borrow<'a, B, R>(&'a self, func: impl FnOnce(&'a B) -> R) -> R
Source§fn pipe_borrow_mut<'a, B, R>(
&'a mut self,
func: impl FnOnce(&'a mut B) -> R,
) -> R
fn pipe_borrow_mut<'a, B, R>( &'a mut self, func: impl FnOnce(&'a mut B) -> R, ) -> R
Source§fn pipe_as_ref<'a, U, R>(&'a self, func: impl FnOnce(&'a U) -> R) -> R
fn pipe_as_ref<'a, U, R>(&'a self, func: impl FnOnce(&'a U) -> R) -> R
Borrows self, then passes self.as_ref() into the pipe function.
Source§fn pipe_as_mut<'a, U, R>(&'a mut self, func: impl FnOnce(&'a mut U) -> R) -> R
fn pipe_as_mut<'a, U, R>(&'a mut self, func: impl FnOnce(&'a mut U) -> R) -> R
Mutably borrows self, then passes self.as_mut() into the pipe
function.
Source§fn pipe_deref<'a, T, R>(&'a self, func: impl FnOnce(&'a T) -> R) -> R
fn pipe_deref<'a, T, R>(&'a self, func: impl FnOnce(&'a T) -> R) -> R
Borrows self, then passes self.deref() into the pipe function.
Source§impl<T> Pointable for T
impl<T> Pointable for T
Source§impl<T> PolicyExt for Twhere
T: ?Sized,
impl<T> PolicyExt for Twhere
T: ?Sized,
Source§impl<T> Tap for T
impl<T> Tap for T
Source§fn tap_borrow<B>(self, func: impl FnOnce(&B)) -> Self
fn tap_borrow<B>(self, func: impl FnOnce(&B)) -> Self
Immutable access to the Borrow<B> of a value. Read more
Source§fn tap_borrow_mut<B>(self, func: impl FnOnce(&mut B)) -> Self
fn tap_borrow_mut<B>(self, func: impl FnOnce(&mut B)) -> Self
Mutable access to the BorrowMut<B> of a value. Read more
Source§fn tap_ref<R>(self, func: impl FnOnce(&R)) -> Self
fn tap_ref<R>(self, func: impl FnOnce(&R)) -> Self
Immutable access to the AsRef<R> view of a value. Read more
Source§fn tap_ref_mut<R>(self, func: impl FnOnce(&mut R)) -> Self
fn tap_ref_mut<R>(self, func: impl FnOnce(&mut R)) -> Self
Mutable access to the AsMut<R> view of a value. Read more
Source§fn tap_deref<T>(self, func: impl FnOnce(&T)) -> Self
fn tap_deref<T>(self, func: impl FnOnce(&T)) -> Self
Immutable access to the Deref::Target of a value. Read more
Source§fn tap_deref_mut<T>(self, func: impl FnOnce(&mut T)) -> Self
fn tap_deref_mut<T>(self, func: impl FnOnce(&mut T)) -> Self
Mutable access to the Deref::Target of a value. Read more
Source§fn tap_dbg(self, func: impl FnOnce(&Self)) -> Self
fn tap_dbg(self, func: impl FnOnce(&Self)) -> Self
Calls .tap() only in debug builds, and is erased in release builds.
Source§fn tap_mut_dbg(self, func: impl FnOnce(&mut Self)) -> Self
fn tap_mut_dbg(self, func: impl FnOnce(&mut Self)) -> Self
Calls .tap_mut() only in debug builds, and is erased in release
builds.
Source§fn tap_borrow_dbg<B>(self, func: impl FnOnce(&B)) -> Self
fn tap_borrow_dbg<B>(self, func: impl FnOnce(&B)) -> Self
Calls .tap_borrow() only in debug builds, and is erased in release
builds.
Source§fn tap_borrow_mut_dbg<B>(self, func: impl FnOnce(&mut B)) -> Self
fn tap_borrow_mut_dbg<B>(self, func: impl FnOnce(&mut B)) -> Self
Calls .tap_borrow_mut() only in debug builds, and is erased in release
builds.
Source§fn tap_ref_dbg<R>(self, func: impl FnOnce(&R)) -> Self
fn tap_ref_dbg<R>(self, func: impl FnOnce(&R)) -> Self
Calls .tap_ref() only in debug builds, and is erased in release
builds.
Source§fn tap_ref_mut_dbg<R>(self, func: impl FnOnce(&mut R)) -> Self
fn tap_ref_mut_dbg<R>(self, func: impl FnOnce(&mut R)) -> Self
Calls .tap_ref_mut() only in debug builds, and is erased in release
builds.
Source§fn tap_deref_dbg<T>(self, func: impl FnOnce(&T)) -> Self
fn tap_deref_dbg<T>(self, func: impl FnOnce(&T)) -> Self
Calls .tap_deref() only in debug builds, and is erased in release
builds.