pub struct EncodingWithOffsets {
pub ids: Vec<u32>,
pub tokens: Vec<String>,
pub offsets: Vec<(usize, usize)>,
}
Expand description
Encoding result with tokens and their character offsets.
Produced by a tokenizer’s encode_with_offsets method (or equivalent).
Used to map between character positions in source text and token indices.
§Example
use candle_mi::EncodingWithOffsets;
let encoding = EncodingWithOffsets::new(
vec![1, 2, 3],
vec!["def".into(), " ".into(), "add".into()],
vec![(0, 3), (3, 4), (4, 7)],
);
// Character 4 ('a' in "add") is in token 2
assert_eq!(encoding.char_to_token(4), Some(2));
Fields§
§ids: Vec<u32>
Token IDs.
tokens: Vec<String>
Token strings.
offsets: Vec<(usize, usize)>
Character offsets for each token, as a (start, end) pair.
Implementations§
Source§impl EncodingWithOffsets
impl EncodingWithOffsets
Sourcepub const fn new(
ids: Vec<u32>,
tokens: Vec<String>,
offsets: Vec<(usize, usize)>,
) -> Self
pub const fn new( ids: Vec<u32>, tokens: Vec<String>, offsets: Vec<(usize, usize)>, ) -> Self
Create a new encoding with offsets.
Sourcepub fn tokens_with_offsets(&self) -> Vec<TokenWithOffset>
pub fn tokens_with_offsets(&self) -> Vec<TokenWithOffset>
Get tokens with their character offsets.
Sourcepub fn char_to_token(&self, char_pos: usize) -> Option<usize>
pub fn char_to_token(&self, char_pos: usize) -> Option<usize>
Find the token index that contains the given character position.
Returns None if no token spans that position.
Sourcepub fn char_to_token_fuzzy(&self, char_pos: usize) -> Option<usize>
pub fn char_to_token_fuzzy(&self, char_pos: usize) -> Option<usize>
Find the token index for a character position, with fuzzy fallback.
If the exact position isn’t contained in any token, returns the index of the closest token by midpoint distance.
Sourcepub fn char_to_token_start(&self, char_pos: usize) -> Option<usize>
pub fn char_to_token_start(&self, char_pos: usize) -> Option<usize>
Find the token index that starts at or after the given character position.
Sourcepub fn char_range_to_tokens(
&self,
start_char: usize,
end_char: usize,
) -> Vec<usize>
pub fn char_range_to_tokens( &self, start_char: usize, end_char: usize, ) -> Vec<usize>
Find all token indices that overlap with the given character range.
Trait Implementations§
Source§impl Clone for EncodingWithOffsets
impl Clone for EncodingWithOffsets
Source§fn clone(&self) -> EncodingWithOffsets
fn clone(&self) -> EncodingWithOffsets
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Auto Trait Implementations§
impl Freeze for EncodingWithOffsets
impl RefUnwindSafe for EncodingWithOffsets
impl Send for EncodingWithOffsets
impl Sync for EncodingWithOffsets
impl Unpin for EncodingWithOffsets
impl UnsafeUnpin for EncodingWithOffsets
impl UnwindSafe for EncodingWithOffsets
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
T: ?Sized,
impl<T> BorrowMut<T> for T
where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for T
where
T: Clone,
impl<T> CloneToUninit for T
where
T: Clone,
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more