pub struct Tokenizer {
pub tokenizer: TokenizerBackend,
pub tokenizer_path: Option<PathBuf>,
pub with_special_tokens: bool,
pub white_space_token_id: u32,
}
Fields§
§tokenizer: TokenizerBackend
§tokenizer_path: Option<PathBuf>
§with_special_tokens: bool
§white_space_token_id: u32
Implementations§
Source§impl Tokenizer
impl Tokenizer
pub fn new_tiktoken<S: AsRef<str>>(model_id: S) -> Result<Self>
pub fn new_from_tokenizer(tokenizer: HFTokenizer) -> Result<Self>
pub fn new_from_tokenizer_json<P: AsRef<Path>>(local_path: P) -> Result<Self>
pub fn new_from_hf_repo<S: AsRef<str>>( hf_token: Option<S>, repo_id: S, ) -> Result<Self>
pub fn tokenize<T: AsRef<str>>(&self, str: T) -> Vec<u32>
pub fn detokenize_one(&self, token: u32) -> Result<String>
pub fn detokenize_many(&self, tokens: &[u32]) -> Result<String>
pub fn count_tokens(&self, str: &str) -> u32
pub fn try_from_single_token_id( &self, try_from_single_token_id: u32, ) -> Result<String>
pub fn try_into_single_token(&self, try_into_single_token: &str) -> Result<u32>
Source§pub fn create_text_window(&self, text: &str, target_token_size: u32) -> String
pub fn create_text_window(&self, text: &str, target_token_size: u32) -> String
Creates a window of text normalized to the specified token size in the center of the text.
§Arguments
text - The input text to create a window from.
target_token_size - The desired number of tokens in the window.
§Returns
A new string that represents the normalized window of text, or the original
text if its token count is less than or equal to target_token_size.
Source§pub fn create_text_range(
&self,
text: &str,
start_token_index: u32,
end_token_index: u32,
) -> String
pub fn create_text_range( &self, text: &str, start_token_index: u32, end_token_index: u32, ) -> String
Creates a range of text from the specified start and end token indices.
§Arguments
text - The input text to create a range from.
start_token_index - The token index at which the range starts.
end_token_index - The token index at which the range ends.
§Returns
A new string containing the text between the specified start and end
token indices.
Trait Implementations§
Auto Trait Implementations§
impl Freeze for Tokenizer
impl RefUnwindSafe for Tokenizer
impl Send for Tokenizer
impl Sync for Tokenizer
impl Unpin for Tokenizer
impl UnwindSafe for Tokenizer
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more