use crate::BitMask;
use std::collections::BTreeMap;
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ByteStringPool {
bytes: Vec<u8>,
offsets: Vec<u32>,
lens: Vec<u32>,
}
impl ByteStringPool {
pub fn new() -> Self {
Self::default()
}
pub fn len(&self) -> usize {
self.offsets.len()
}
pub fn is_empty(&self) -> bool {
self.offsets.is_empty()
}
pub fn byte_size(&self) -> usize {
self.bytes.len()
}
pub fn push(&mut self, bytes: &[u8]) -> Result<ByteStrView, ByteDictError> {
let len = u32::try_from(bytes.len()).map_err(|_| ByteDictError::EntryTooLong {
got: bytes.len(),
})?;
let new_total = self.bytes.len().checked_add(bytes.len()).ok_or(
ByteDictError::PoolOverflow {
attempted: bytes.len(),
current: self.bytes.len(),
},
)?;
let offset = u32::try_from(self.bytes.len()).map_err(|_| ByteDictError::PoolOverflow {
attempted: bytes.len(),
current: self.bytes.len(),
})?;
if new_total > u32::MAX as usize {
return Err(ByteDictError::PoolOverflow {
attempted: bytes.len(),
current: self.bytes.len(),
});
}
self.bytes.extend_from_slice(bytes);
self.offsets.push(offset);
self.lens.push(len);
Ok(ByteStrView { offset, len })
}
pub fn get(&self, view: ByteStrView) -> &[u8] {
let start = view.offset as usize;
let end = start.saturating_add(view.len as usize);
if end > self.bytes.len() {
debug_assert!(false, "ByteStrView out of bounds for pool");
return &[];
}
&self.bytes[start..end]
}
pub fn get_by_index(&self, idx: usize) -> Option<&[u8]> {
let offset = *self.offsets.get(idx)? as usize;
let len = *self.lens.get(idx)? as usize;
Some(&self.bytes[offset..offset + len])
}
pub fn iter(&self) -> impl Iterator<Item = (usize, &[u8])> + '_ {
self.offsets.iter().zip(self.lens.iter()).enumerate().map(
move |(i, (&off, &len))| {
let start = off as usize;
let end = start + len as usize;
(i, &self.bytes[start..end])
},
)
}
pub fn iter_views(&self) -> impl Iterator<Item = ByteStrView> + '_ {
self.offsets
.iter()
.zip(self.lens.iter())
.map(|(&offset, &len)| ByteStrView { offset, len })
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ByteStrView {
pub offset: u32,
pub len: u32,
}
impl ByteStrView {
pub fn byte_len(self) -> usize {
self.len as usize
}
pub fn is_empty(self) -> bool {
self.len == 0
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AdaptiveCodes {
U8(Vec<u8>),
U16(Vec<u16>),
U32(Vec<u32>),
U64(Vec<u64>),
}
impl AdaptiveCodes {
pub fn new() -> Self {
AdaptiveCodes::U8(Vec::new())
}
pub fn with_capacity(cap: usize) -> Self {
AdaptiveCodes::U8(Vec::with_capacity(cap))
}
pub fn len(&self) -> usize {
match self {
AdaptiveCodes::U8(v) => v.len(),
AdaptiveCodes::U16(v) => v.len(),
AdaptiveCodes::U32(v) => v.len(),
AdaptiveCodes::U64(v) => v.len(),
}
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn push(&mut self, code: u64) {
if self.needs_promotion_for(code) {
self.promote_to_fit(code);
}
match self {
AdaptiveCodes::U8(v) => v.push(code as u8),
AdaptiveCodes::U16(v) => v.push(code as u16),
AdaptiveCodes::U32(v) => v.push(code as u32),
AdaptiveCodes::U64(v) => v.push(code),
}
}
pub fn get(&self, i: usize) -> u64 {
match self {
AdaptiveCodes::U8(v) => v[i] as u64,
AdaptiveCodes::U16(v) => v[i] as u64,
AdaptiveCodes::U32(v) => v[i] as u64,
AdaptiveCodes::U64(v) => v[i],
}
}
pub fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
match self {
AdaptiveCodes::U8(v) => Box::new(v.iter().map(|&x| x as u64)),
AdaptiveCodes::U16(v) => Box::new(v.iter().map(|&x| x as u64)),
AdaptiveCodes::U32(v) => Box::new(v.iter().map(|&x| x as u64)),
AdaptiveCodes::U64(v) => Box::new(v.iter().copied()),
}
}
pub fn width_bytes(&self) -> usize {
match self {
AdaptiveCodes::U8(_) => 1,
AdaptiveCodes::U16(_) => 2,
AdaptiveCodes::U32(_) => 4,
AdaptiveCodes::U64(_) => 8,
}
}
fn needs_promotion_for(&self, code: u64) -> bool {
match self {
AdaptiveCodes::U8(_) => code > u8::MAX as u64,
AdaptiveCodes::U16(_) => code > u16::MAX as u64,
AdaptiveCodes::U32(_) => code > u32::MAX as u64,
AdaptiveCodes::U64(_) => false,
}
}
fn promote_to_fit(&mut self, code: u64) {
let old = std::mem::replace(self, AdaptiveCodes::U64(Vec::new()));
let target = if code <= u16::MAX as u64 {
let v: Vec<u16> = match old {
AdaptiveCodes::U8(v) => v.into_iter().map(|x| x as u16).collect(),
AdaptiveCodes::U16(v) => v,
AdaptiveCodes::U32(_) | AdaptiveCodes::U64(_) => {
debug_assert!(false, "promote_to_fit called with shrinking target");
Vec::new()
}
};
AdaptiveCodes::U16(v)
} else if code <= u32::MAX as u64 {
let v: Vec<u32> = match old {
AdaptiveCodes::U8(v) => v.into_iter().map(|x| x as u32).collect(),
AdaptiveCodes::U16(v) => v.into_iter().map(|x| x as u32).collect(),
AdaptiveCodes::U32(v) => v,
AdaptiveCodes::U64(_) => {
debug_assert!(false, "promote_to_fit called with shrinking target");
Vec::new()
}
};
AdaptiveCodes::U32(v)
} else {
let v: Vec<u64> = match old {
AdaptiveCodes::U8(v) => v.into_iter().map(|x| x as u64).collect(),
AdaptiveCodes::U16(v) => v.into_iter().map(|x| x as u64).collect(),
AdaptiveCodes::U32(v) => v.into_iter().map(|x| x as u64).collect(),
AdaptiveCodes::U64(v) => v,
};
AdaptiveCodes::U64(v)
};
*self = target;
}
}
impl Default for AdaptiveCodes {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CategoryOrdering {
FirstSeen,
Lexical,
Explicit(Vec<Vec<u8>>),
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UnknownCategoryPolicy {
Error,
MapToOther { other_code: u64 },
MapToNull,
ExtendDictionary,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InternedCode {
Code(u64),
Null,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ByteDictError {
EntryTooLong { got: usize },
PoolOverflow { attempted: usize, current: usize },
UnknownCategory,
Frozen,
ExplicitOrderingHasDuplicates,
OtherCodeNotInDictionary { code: u64 },
}
impl std::fmt::Display for ByteDictError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ByteDictError::EntryTooLong { got } => {
write!(f, "ByteStringPool entry too long: {got} bytes (max {})", u32::MAX)
}
ByteDictError::PoolOverflow { attempted, current } => write!(
f,
"ByteStringPool overflow: cannot append {attempted} bytes (current {current})"
),
ByteDictError::UnknownCategory => {
write!(f, "unknown category in frozen dictionary (Error policy)")
}
ByteDictError::Frozen => write!(f, "dictionary is frozen"),
ByteDictError::ExplicitOrderingHasDuplicates => {
write!(f, "explicit ordering contains duplicate values")
}
ByteDictError::OtherCodeNotInDictionary { code } => {
write!(f, "MapToOther policy: other_code {code} not in dictionary")
}
}
}
}
impl std::error::Error for ByteDictError {}
#[derive(Debug, Clone)]
pub struct ByteDictionary {
pool: ByteStringPool,
lookup: BTreeMap<Vec<u8>, u64>,
code_to_view: Vec<ByteStrView>,
frozen: bool,
ordering: CategoryOrdering,
dharht: Option<crate::detcoll::DHarht<u64>>,
hash_index: Option<crate::detcoll::SealedU64Map<u64>>,
}
impl ByteDictionary {
pub fn new() -> Self {
Self::with_ordering(CategoryOrdering::FirstSeen)
}
pub fn with_ordering(ordering: CategoryOrdering) -> Self {
let mut dict = ByteDictionary {
pool: ByteStringPool::new(),
lookup: BTreeMap::new(),
code_to_view: Vec::new(),
frozen: false,
ordering: ordering.clone(),
dharht: None,
hash_index: None,
};
if let CategoryOrdering::Explicit(values) = ordering {
for v in values {
let _ = dict.intern_internal(&v, false);
}
}
dict
}
pub fn from_explicit(values: Vec<Vec<u8>>) -> Result<Self, ByteDictError> {
let mut sorted = values.clone();
sorted.sort();
for w in sorted.windows(2) {
if w[0] == w[1] {
return Err(ByteDictError::ExplicitOrderingHasDuplicates);
}
}
Ok(Self::with_ordering(CategoryOrdering::Explicit(values)))
}
pub fn len(&self) -> usize {
self.code_to_view.len()
}
pub fn is_empty(&self) -> bool {
self.code_to_view.is_empty()
}
pub fn is_frozen(&self) -> bool {
self.frozen
}
pub fn ordering(&self) -> &CategoryOrdering {
&self.ordering
}
pub fn freeze(&mut self) {
self.frozen = true;
}
pub fn pool(&self) -> &ByteStringPool {
&self.pool
}
pub fn get(&self, code: u64) -> Option<&[u8]> {
let idx = usize::try_from(code).ok()?;
let view = *self.code_to_view.get(idx)?;
Some(self.pool.get(view))
}
pub fn lookup(&self, bytes: &[u8]) -> Option<u64> {
if let Some(d) = &self.dharht {
return d.get_bytes(bytes).copied();
}
self.lookup.get(bytes).copied()
}
pub fn seal_for_lookup(&mut self) {
let mut d = crate::detcoll::DHarht::new();
for (k, &code) in &self.lookup {
d.insert_bytes(k.as_slice(), code);
}
d.seal_for_lookup();
self.dharht = Some(d);
}
pub fn is_lookup_sealed(&self) -> bool {
self.dharht.is_some()
}
pub fn dharht_overflow_count(&self) -> u64 {
self.dharht.as_ref().map(|d| d.overflow_count()).unwrap_or(0)
}
pub fn seal_with_u64_hash_index(&mut self) {
let mut idx = crate::detcoll::SealedU64Map::new();
for (k, &code) in &self.lookup {
let h = crate::detcoll::hash_bytes(k.as_slice());
idx.insert(h, code);
}
idx.seal();
self.hash_index = Some(idx);
}
pub fn is_hash_indexed(&self) -> bool {
self.hash_index.is_some()
}
pub fn lookup_by_hash(&self, hash: u64) -> Option<u64> {
self.hash_index.as_ref()?.get(hash).copied()
}
pub fn lookup_by_hash_verify(&self, hash: u64, bytes: &[u8]) -> Option<u64> {
let code = self.lookup_by_hash(hash)?;
let stored = self.get(code)?;
if stored == bytes { Some(code) } else { None }
}
pub fn intern(&mut self, bytes: &[u8]) -> Result<u64, ByteDictError> {
self.intern_internal(bytes, true)
}
pub fn intern_with_policy(
&mut self,
bytes: &[u8],
policy: &UnknownCategoryPolicy,
) -> Result<InternedCode, ByteDictError> {
if let Some(c) = self.lookup.get(bytes) {
return Ok(InternedCode::Code(*c));
}
if !self.frozen {
return self.intern_internal(bytes, true).map(InternedCode::Code);
}
match policy {
UnknownCategoryPolicy::Error => Err(ByteDictError::UnknownCategory),
UnknownCategoryPolicy::MapToNull => Ok(InternedCode::Null),
UnknownCategoryPolicy::MapToOther { other_code } => {
if (*other_code as usize) < self.code_to_view.len() {
Ok(InternedCode::Code(*other_code))
} else {
Err(ByteDictError::OtherCodeNotInDictionary { code: *other_code })
}
}
UnknownCategoryPolicy::ExtendDictionary => {
self.intern_internal(bytes, false).map(InternedCode::Code)
}
}
}
fn intern_internal(
&mut self,
bytes: &[u8],
respect_frozen: bool,
) -> Result<u64, ByteDictError> {
if let Some(&c) = self.lookup.get(bytes) {
return Ok(c);
}
if respect_frozen && self.frozen {
return Err(ByteDictError::Frozen);
}
if let CategoryOrdering::Explicit(_) = &self.ordering {
if respect_frozen {
return Err(ByteDictError::UnknownCategory);
}
}
let view = self.pool.push(bytes)?;
let code = self.code_to_view.len() as u64;
self.code_to_view.push(view);
self.lookup.insert(bytes.to_vec(), code);
Ok(code)
}
pub fn seal_lexical(&mut self) -> Vec<u64> {
if !matches!(self.ordering, CategoryOrdering::Lexical) {
return (0..self.code_to_view.len() as u64).collect();
}
let mut entries: Vec<(Vec<u8>, u64)> = self
.lookup
.iter()
.map(|(k, &v)| (k.clone(), v))
.collect();
entries.sort_by(|a, b| a.0.cmp(&b.0));
let mut perm = vec![0u64; self.code_to_view.len()];
let mut new_pool = ByteStringPool::new();
let mut new_code_to_view: Vec<ByteStrView> = Vec::with_capacity(entries.len());
let mut new_lookup: BTreeMap<Vec<u8>, u64> = BTreeMap::new();
for (new_code, (bytes, old_code)) in entries.into_iter().enumerate() {
let view = new_pool
.push(&bytes)
.expect("seal_lexical: re-push cannot exceed original pool size");
new_code_to_view.push(view);
new_lookup.insert(bytes, new_code as u64);
perm[old_code as usize] = new_code as u64;
}
self.pool = new_pool;
self.code_to_view = new_code_to_view;
self.lookup = new_lookup;
perm
}
pub fn iter(&self) -> impl Iterator<Item = (u64, &[u8])> + '_ {
self.code_to_view.iter().enumerate().map(move |(i, &v)| {
(i as u64, self.pool.get(v))
})
}
}
impl Default for ByteDictionary {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub struct CategoricalColumn {
codes: AdaptiveCodes,
dictionary: ByteDictionary,
nulls: Option<BitMask>,
}
impl CategoricalColumn {
pub fn new() -> Self {
Self {
codes: AdaptiveCodes::new(),
dictionary: ByteDictionary::new(),
nulls: None,
}
}
pub fn with_dictionary(dictionary: ByteDictionary) -> Self {
Self {
codes: AdaptiveCodes::new(),
dictionary,
nulls: None,
}
}
pub fn len(&self) -> usize {
self.codes.len()
}
pub fn is_empty(&self) -> bool {
self.codes.is_empty()
}
pub fn dictionary(&self) -> &ByteDictionary {
&self.dictionary
}
pub fn codes(&self) -> &AdaptiveCodes {
&self.codes
}
pub fn nulls(&self) -> Option<&BitMask> {
self.nulls.as_ref()
}
pub fn is_null(&self, i: usize) -> bool {
match &self.nulls {
None => false,
Some(b) => !b.get(i),
}
}
pub fn push(&mut self, bytes: &[u8]) -> Result<u64, ByteDictError> {
let code = self.dictionary.intern(bytes)?;
self.codes.push(code);
if let Some(b) = &mut self.nulls {
*b = bitmask_with_extra_valid(b);
}
Ok(code)
}
pub fn push_null(&mut self) {
self.codes.push(0);
match &mut self.nulls {
None => {
let n = self.codes.len();
let mut words = vec![0u64; (n + 63) / 64];
for i in 0..(n - 1) {
words[i / 64] |= 1u64 << (i % 64);
}
self.nulls =
Some(BitMask::from_words_for_test(words, n));
}
Some(b) => {
*b = bitmask_with_extra_invalid(b);
}
}
}
pub fn push_with_policy(
&mut self,
bytes: &[u8],
policy: &UnknownCategoryPolicy,
) -> Result<(), ByteDictError> {
match self.dictionary.intern_with_policy(bytes, policy)? {
InternedCode::Code(c) => {
self.codes.push(c);
if let Some(b) = &mut self.nulls {
*b = bitmask_with_extra_valid(b);
}
}
InternedCode::Null => {
self.push_null();
}
}
Ok(())
}
pub fn get(&self, i: usize) -> Option<&[u8]> {
if self.is_null(i) {
return None;
}
let code = self.codes.get(i);
self.dictionary.get(code)
}
pub fn iter(&self) -> impl Iterator<Item = (usize, Option<&[u8]>)> + '_ {
(0..self.len()).map(move |i| (i, self.get(i)))
}
pub fn seal_lexical(&mut self) {
if !matches!(self.dictionary.ordering(), CategoryOrdering::Lexical) {
return;
}
let perm = self.dictionary.seal_lexical();
let mut new_codes = AdaptiveCodes::with_capacity(self.codes.len());
for c in self.codes.iter() {
new_codes.push(perm[c as usize]);
}
self.codes = new_codes;
}
pub fn profile(&self) -> CategoricalProfile {
let cardinality = self.dictionary.len();
let nrows = self.len();
let missing = match &self.nulls {
None => 0,
Some(b) => nrows - b.count_ones(),
};
let bytes_used = self.dictionary.pool().byte_size();
let avg_byte_len = if cardinality == 0 {
0
} else {
bytes_used / cardinality
};
let mut max_byte_len = 0usize;
for (_, b) in self.dictionary.iter() {
if b.len() > max_byte_len {
max_byte_len = b.len();
}
}
let unique_ratio_thousandths = if nrows == 0 {
0
} else {
(cardinality as u64).saturating_mul(1000) / nrows as u64
};
CategoricalProfile {
nrows,
cardinality,
missing,
bytes_used,
avg_byte_len,
max_byte_len,
code_width_bytes: self.codes.width_bytes(),
unique_ratio_thousandths,
}
}
}
impl Default for CategoricalColumn {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CategoricalProfile {
pub nrows: usize,
pub cardinality: usize,
pub missing: usize,
pub bytes_used: usize,
pub avg_byte_len: usize,
pub max_byte_len: usize,
pub code_width_bytes: usize,
pub unique_ratio_thousandths: u64,
}
fn bitmask_with_extra_valid(b: &BitMask) -> BitMask {
let n = b.nrows() + 1;
let mut words: Vec<u64> = b.words_slice().to_vec();
let needed = (n + 63) / 64;
while words.len() < needed {
words.push(0);
}
let i = n - 1;
words[i / 64] |= 1u64 << (i % 64);
BitMask::from_words_for_test(words, n)
}
fn bitmask_with_extra_invalid(b: &BitMask) -> BitMask {
let n = b.nrows() + 1;
let mut words: Vec<u64> = b.words_slice().to_vec();
let needed = (n + 63) / 64;
while words.len() < needed {
words.push(0);
}
BitMask::from_words_for_test(words, n)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn pool_round_trip_simple() {
let mut p = ByteStringPool::new();
let v_hello = p.push(b"hello").unwrap();
let v_world = p.push(b"world").unwrap();
assert_eq!(p.get(v_hello), b"hello");
assert_eq!(p.get(v_world), b"world");
assert_eq!(p.len(), 2);
assert_eq!(p.byte_size(), 10);
}
#[test]
fn pool_round_trip_empty_strings() {
let mut p = ByteStringPool::new();
let v_empty = p.push(b"").unwrap();
let v_x = p.push(b"x").unwrap();
let v_empty2 = p.push(b"").unwrap();
assert!(p.get(v_empty).is_empty());
assert_eq!(p.get(v_x), b"x");
assert!(p.get(v_empty2).is_empty());
assert_eq!(p.len(), 3);
}
#[test]
fn pool_round_trip_embedded_nul_and_high_bytes() {
let mut p = ByteStringPool::new();
let payload: &[u8] = &[0u8, 1, 2, 0, 0xff, 0xfe, b'a'];
let v = p.push(payload).unwrap();
assert_eq!(p.get(v), payload);
}
#[test]
fn pool_get_by_index_matches_views() {
let mut p = ByteStringPool::new();
p.push(b"a").unwrap();
p.push(b"bb").unwrap();
p.push(b"ccc").unwrap();
assert_eq!(p.get_by_index(0).unwrap(), b"a");
assert_eq!(p.get_by_index(1).unwrap(), b"bb");
assert_eq!(p.get_by_index(2).unwrap(), b"ccc");
assert!(p.get_by_index(3).is_none());
}
#[test]
fn pool_iter_is_insertion_order() {
let mut p = ByteStringPool::new();
p.push(b"z").unwrap();
p.push(b"a").unwrap();
p.push(b"m").unwrap();
let collected: Vec<&[u8]> = p.iter().map(|(_, b)| b).collect();
assert_eq!(collected, vec![b"z" as &[u8], b"a", b"m"]);
}
#[test]
fn adaptive_codes_starts_u8() {
let c = AdaptiveCodes::new();
assert!(matches!(c, AdaptiveCodes::U8(_)));
assert_eq!(c.width_bytes(), 1);
}
#[test]
fn adaptive_codes_stays_u8_at_255() {
let mut c = AdaptiveCodes::new();
for i in 0u64..=255 {
c.push(i);
}
assert!(matches!(c, AdaptiveCodes::U8(_)));
assert_eq!(c.len(), 256);
for i in 0u64..=255 {
assert_eq!(c.get(i as usize), i);
}
}
#[test]
fn adaptive_codes_promotes_u8_to_u16_at_256() {
let mut c = AdaptiveCodes::new();
for i in 0u64..=255 {
c.push(i);
}
assert!(matches!(c, AdaptiveCodes::U8(_)));
c.push(256);
assert!(matches!(c, AdaptiveCodes::U16(_)));
for i in 0u64..=255 {
assert_eq!(c.get(i as usize), i);
}
assert_eq!(c.get(256), 256);
}
#[test]
fn adaptive_codes_promotes_u16_to_u32_at_65536() {
let mut c = AdaptiveCodes::U16(vec![0u16, 1, 2, 65_535]);
c.push(65_536);
assert!(matches!(c, AdaptiveCodes::U32(_)));
assert_eq!(c.get(0), 0);
assert_eq!(c.get(3), 65_535);
assert_eq!(c.get(4), 65_536);
}
#[test]
fn adaptive_codes_promotes_u32_to_u64_above_4b() {
let mut c = AdaptiveCodes::U32(vec![0u32, 1, u32::MAX]);
let huge = (u32::MAX as u64) + 1;
c.push(huge);
assert!(matches!(c, AdaptiveCodes::U64(_)));
assert_eq!(c.get(0), 0);
assert_eq!(c.get(2), u32::MAX as u64);
assert_eq!(c.get(3), huge);
}
#[test]
fn adaptive_codes_iter_matches_get() {
let mut c = AdaptiveCodes::new();
for i in 0u64..10 {
c.push(i * 7);
}
let via_iter: Vec<u64> = c.iter().collect();
let via_get: Vec<u64> = (0..c.len()).map(|i| c.get(i)).collect();
assert_eq!(via_iter, via_get);
}
#[test]
fn dict_intern_assigns_sequential_codes() {
let mut d = ByteDictionary::new();
assert_eq!(d.intern(b"red").unwrap(), 0);
assert_eq!(d.intern(b"green").unwrap(), 1);
assert_eq!(d.intern(b"blue").unwrap(), 2);
assert_eq!(d.intern(b"red").unwrap(), 0);
assert_eq!(d.intern(b"green").unwrap(), 1);
assert_eq!(d.len(), 3);
}
#[test]
fn dict_get_round_trips_bytes() {
let mut d = ByteDictionary::new();
d.intern(b"red").unwrap();
d.intern(b"green").unwrap();
d.intern(b"blue").unwrap();
assert_eq!(d.get(0).unwrap(), b"red");
assert_eq!(d.get(1).unwrap(), b"green");
assert_eq!(d.get(2).unwrap(), b"blue");
assert!(d.get(99).is_none());
}
#[test]
fn dict_lookup_does_not_extend() {
let mut d = ByteDictionary::new();
d.intern(b"red").unwrap();
assert_eq!(d.lookup(b"red"), Some(0));
assert_eq!(d.lookup(b"green"), None);
assert_eq!(d.len(), 1);
}
#[test]
fn dict_first_seen_is_insertion_order() {
let mut d = ByteDictionary::new();
assert_eq!(d.intern(b"B").unwrap(), 0);
assert_eq!(d.intern(b"A").unwrap(), 1);
assert_eq!(d.intern(b"C").unwrap(), 2);
assert_eq!(d.get(0).unwrap(), b"B");
assert_eq!(d.get(1).unwrap(), b"A");
assert_eq!(d.get(2).unwrap(), b"C");
}
#[test]
fn dict_lexical_seal_reorders_by_bytes() {
let mut d = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
d.intern(b"banana").unwrap();
d.intern(b"apple").unwrap();
d.intern(b"cherry").unwrap();
let perm = d.seal_lexical();
assert_eq!(d.get(0).unwrap(), b"apple");
assert_eq!(d.get(1).unwrap(), b"banana");
assert_eq!(d.get(2).unwrap(), b"cherry");
assert_eq!(perm, vec![1, 0, 2]);
}
#[test]
fn dict_lexical_two_insertion_orders_seal_to_same_codes() {
let mut d1 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
let mut d2 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
d1.intern(b"banana").unwrap();
d1.intern(b"apple").unwrap();
d1.intern(b"cherry").unwrap();
d2.intern(b"cherry").unwrap();
d2.intern(b"banana").unwrap();
d2.intern(b"apple").unwrap();
d1.seal_lexical();
d2.seal_lexical();
for code in 0u64..3 {
assert_eq!(d1.get(code), d2.get(code), "lex ordering diverges at {code}");
}
}
#[test]
fn dict_explicit_pre_populates_codes() {
let d = ByteDictionary::from_explicit(vec![
b"low".to_vec(),
b"med".to_vec(),
b"high".to_vec(),
])
.unwrap();
assert_eq!(d.lookup(b"low"), Some(0));
assert_eq!(d.lookup(b"med"), Some(1));
assert_eq!(d.lookup(b"high"), Some(2));
}
#[test]
fn dict_explicit_rejects_duplicates() {
let err = ByteDictionary::from_explicit(vec![
b"a".to_vec(),
b"b".to_vec(),
b"a".to_vec(),
]);
assert!(matches!(err, Err(ByteDictError::ExplicitOrderingHasDuplicates)));
}
#[test]
fn dict_explicit_rejects_unknown_intern_when_frozen() {
let mut d = ByteDictionary::from_explicit(vec![b"a".to_vec(), b"b".to_vec()]).unwrap();
d.freeze();
let res = d.intern(b"c");
assert!(matches!(res, Err(ByteDictError::Frozen)));
}
#[test]
fn dict_frozen_intern_errors() {
let mut d = ByteDictionary::new();
d.intern(b"a").unwrap();
d.freeze();
let err = d.intern(b"b");
assert!(matches!(err, Err(ByteDictError::Frozen)));
}
#[test]
fn policy_error_returns_unknown_category() {
let mut d = ByteDictionary::new();
d.intern(b"a").unwrap();
d.freeze();
let res = d.intern_with_policy(b"b", &UnknownCategoryPolicy::Error);
assert!(matches!(res, Err(ByteDictError::UnknownCategory)));
}
#[test]
fn policy_map_to_null_returns_null() {
let mut d = ByteDictionary::new();
d.intern(b"a").unwrap();
d.freeze();
let res = d.intern_with_policy(b"b", &UnknownCategoryPolicy::MapToNull);
assert_eq!(res, Ok(InternedCode::Null));
}
#[test]
fn policy_map_to_other_returns_other_code() {
let mut d = ByteDictionary::new();
let _a = d.intern(b"a").unwrap();
let other = d.intern(b"Other").unwrap();
d.freeze();
let res = d.intern_with_policy(
b"unseen",
&UnknownCategoryPolicy::MapToOther { other_code: other },
);
assert_eq!(res, Ok(InternedCode::Code(other)));
}
#[test]
fn policy_map_to_other_rejects_invalid_other_code() {
let mut d = ByteDictionary::new();
d.intern(b"a").unwrap();
d.freeze();
let res = d.intern_with_policy(
b"x",
&UnknownCategoryPolicy::MapToOther { other_code: 99 },
);
assert!(matches!(
res,
Err(ByteDictError::OtherCodeNotInDictionary { code: 99 })
));
}
#[test]
fn policy_extend_dictionary_bypasses_frozen() {
let mut d = ByteDictionary::new();
d.intern(b"a").unwrap();
d.freeze();
let res = d
.intern_with_policy(b"b", &UnknownCategoryPolicy::ExtendDictionary)
.unwrap();
assert_eq!(res, InternedCode::Code(1));
assert_eq!(d.len(), 2);
assert!(d.is_frozen());
let strict = d.intern_with_policy(b"c", &UnknownCategoryPolicy::Error);
assert!(matches!(strict, Err(ByteDictError::UnknownCategory)));
}
#[test]
fn col_push_round_trip() {
let mut c = CategoricalColumn::new();
c.push(b"red").unwrap();
c.push(b"blue").unwrap();
c.push(b"red").unwrap();
assert_eq!(c.len(), 3);
assert_eq!(c.get(0).unwrap(), b"red");
assert_eq!(c.get(1).unwrap(), b"blue");
assert_eq!(c.get(2).unwrap(), b"red");
assert_eq!(c.dictionary().len(), 2);
}
#[test]
fn col_push_null_is_observed() {
let mut c = CategoricalColumn::new();
c.push(b"red").unwrap();
c.push_null();
c.push(b"blue").unwrap();
assert_eq!(c.len(), 3);
assert!(!c.is_null(0));
assert!(c.is_null(1));
assert!(!c.is_null(2));
assert_eq!(c.get(0).unwrap(), b"red");
assert!(c.get(1).is_none());
assert_eq!(c.get(2).unwrap(), b"blue");
}
#[test]
fn col_seal_lexical_remaps_codes() {
let mut c =
CategoricalColumn::with_dictionary(ByteDictionary::with_ordering(CategoryOrdering::Lexical));
c.push(b"banana").unwrap();
c.push(b"apple").unwrap();
c.push(b"banana").unwrap();
c.push(b"cherry").unwrap();
c.seal_lexical();
assert_eq!(c.codes().get(0), 1, "row0 was banana → seal to 1");
assert_eq!(c.codes().get(1), 0, "row1 was apple → seal to 0");
assert_eq!(c.codes().get(2), 1, "row2 was banana → seal to 1");
assert_eq!(c.codes().get(3), 2, "row3 was cherry → seal to 2");
assert_eq!(c.get(0).unwrap(), b"banana");
assert_eq!(c.get(1).unwrap(), b"apple");
assert_eq!(c.get(2).unwrap(), b"banana");
assert_eq!(c.get(3).unwrap(), b"cherry");
}
#[test]
fn col_promotion_at_256_distinct() {
let mut c = CategoricalColumn::new();
for i in 0u32..257 {
let s = format!("v{}", i);
c.push(s.as_bytes()).unwrap();
}
assert_eq!(c.dictionary().len(), 257);
assert!(matches!(
c.codes(),
AdaptiveCodes::U16(_) | AdaptiveCodes::U32(_) | AdaptiveCodes::U64(_)
));
for i in 0u32..257 {
let s = format!("v{}", i);
assert_eq!(c.get(i as usize).unwrap(), s.as_bytes());
}
}
#[test]
fn col_push_with_policy_null_records_null() {
let mut c = CategoricalColumn::new();
c.push(b"a").unwrap();
c.push(b"b").unwrap();
let mut dict = ByteDictionary::new();
dict.intern(b"a").unwrap();
dict.intern(b"b").unwrap();
dict.freeze();
let mut c2 = CategoricalColumn::with_dictionary(dict);
c2.push_with_policy(b"a", &UnknownCategoryPolicy::Error).unwrap();
c2.push_with_policy(b"unseen", &UnknownCategoryPolicy::MapToNull)
.unwrap();
c2.push_with_policy(b"b", &UnknownCategoryPolicy::Error).unwrap();
assert_eq!(c2.len(), 3);
assert!(!c2.is_null(0));
assert!(c2.is_null(1));
assert!(!c2.is_null(2));
}
#[test]
fn profile_reports_basic_stats() {
let mut c = CategoricalColumn::new();
c.push(b"red").unwrap();
c.push(b"blue").unwrap();
c.push_null();
c.push(b"red").unwrap();
let p = c.profile();
assert_eq!(p.nrows, 4);
assert_eq!(p.cardinality, 2);
assert_eq!(p.missing, 1);
assert_eq!(p.bytes_used, 7); assert_eq!(p.unique_ratio_thousandths, 500);
assert_eq!(p.code_width_bytes, 1);
}
#[test]
fn determinism_same_input_first_seen_two_dicts() {
let inputs: &[&[u8]] = &[b"alpha", b"beta", b"alpha", b"gamma", b"beta", b"delta"];
let mut d1 = ByteDictionary::new();
let mut d2 = ByteDictionary::new();
let codes1: Vec<u64> = inputs.iter().map(|b| d1.intern(b).unwrap()).collect();
let codes2: Vec<u64> = inputs.iter().map(|b| d2.intern(b).unwrap()).collect();
assert_eq!(codes1, codes2, "FirstSeen must be deterministic");
assert_eq!(codes1, vec![0, 1, 0, 2, 1, 3]);
}
#[test]
fn determinism_lexical_two_permutations_seal_identical() {
let perm1: &[&[u8]] = &[b"alpha", b"beta", b"gamma", b"delta"];
let perm2: &[&[u8]] = &[b"delta", b"gamma", b"beta", b"alpha"];
let mut d1 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
let mut d2 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
for s in perm1 {
d1.intern(s).unwrap();
}
for s in perm2 {
d2.intern(s).unwrap();
}
d1.seal_lexical();
d2.seal_lexical();
for code in 0u64..4 {
assert_eq!(d1.get(code), d2.get(code), "code {code}");
}
}
}