use crate::types::ByteSpan;
use crate::types::Token;
/// A token dictionary stored as one flat byte buffer plus an offset table.
///
/// The methods on this type read token `i` as the slice
/// `bytes[offsets[i]..offsets[i + 1]]`, so `offsets` carries one more entry
/// than there are tokens.
#[derive(Clone, Debug, Default)]
pub struct Dictionary {
    /// Backing storage that token slices are taken from.
    pub bytes: Vec<u8>,
    /// Boundaries into `bytes`; consecutive entries delimit one token.
    pub offsets: Vec<u32>,
    /// NOTE(review): presumably the bit width used to encode a token id;
    /// nothing in this file reads it — confirm against callers.
    pub bits: u32,
}
impl Dictionary {
    /// Returns how many tokens the offset table describes.
    ///
    /// This is `offsets.len() - 1`, or `0` when `offsets` is empty.
    #[inline]
    pub(crate) fn num_tokens(&self) -> usize {
        self.offsets.len().saturating_sub(1)
    }

    /// Returns the byte range of token `id` as recorded in `offsets`.
    ///
    /// # Panics
    ///
    /// Panics if `id` or `id + 1` is not a valid index into `offsets`.
    #[inline]
    pub(crate) fn span(&self, id: Token) -> ByteSpan {
        let index = id as usize;
        // Read the lower bound first so an out-of-range `id` fails on it.
        let begin = self.offsets[index];
        let end = self.offsets[index + 1];
        ByteSpan { begin, end }
    }

    /// Returns the bytes of token `id`, borrowed from `bytes`.
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as [`Dictionary::span`], or if the
    /// resulting range does not lie within `bytes`.
    #[inline]
    pub(crate) fn data(&self, id: Token) -> &[u8] {
        let range = self.span(id);
        let (lo, hi) = (range.begin as usize, range.end as usize);
        &self.bytes[lo..hi]
    }

    /// Returns the size of token `id` as reported by its span.
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as [`Dictionary::span`].
    #[inline]
    // NOTE(review): presumably only exercised by tests at the moment, hence
    // the lint suppression — confirm before removing.
    #[allow(dead_code)]
    pub(crate) fn token_size(&self, id: Token) -> usize {
        let range = self.span(id);
        range.size() as usize
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a dictionary over "abcdef" whose offsets are `[0, 1, 3, 6]`,
    /// i.e. the three tokens "a", "bc" and "def".
    fn abcdef_dictionary() -> Dictionary {
        Dictionary {
            bytes: b"abcdef".to_vec(),
            offsets: vec![0, 1, 3, 6],
            bits: 12,
        }
    }

    #[test]
    fn num_tokens_zero_when_offsets_empty() {
        let empty = Dictionary::default();
        assert_eq!(empty.num_tokens(), 0);
    }

    #[test]
    fn num_tokens_is_offsets_len_minus_one() {
        // Only the offset table matters here; `bytes` stays empty.
        let dict = Dictionary {
            offsets: vec![0, 3, 5, 8],
            bits: 12,
            ..Dictionary::default()
        };
        assert_eq!(dict.num_tokens(), 3);
    }

    #[test]
    fn span_returns_correct_range() {
        let dict = abcdef_dictionary();
        assert_eq!(dict.span(0), ByteSpan { begin: 0, end: 1 });
        assert_eq!(dict.span(1), ByteSpan { begin: 1, end: 3 });
        assert_eq!(dict.span(2), ByteSpan { begin: 3, end: 6 });
    }

    #[test]
    fn data_returns_correct_slice() {
        let dict = abcdef_dictionary();
        assert_eq!(dict.data(0), b"a");
        assert_eq!(dict.data(1), b"bc");
        assert_eq!(dict.data(2), b"def");
    }

    #[test]
    fn token_size_consistent_with_span() {
        let dict = abcdef_dictionary();
        for token in 0u16..3 {
            let expected = dict.span(token).size() as usize;
            assert_eq!(dict.token_size(token), expected);
        }
    }
}