use std::collections::HashMap;
/// Index value written in the dictionary-encoded index stream to mark a
/// `None` entry; the dictionary decoder maps it back to `None`.
const NULL_SENTINEL: u16 = u16::MAX;
/// Upper bound on the number of distinct strings held in a dictionary.
/// `encode_strings` switches to the raw encoding once this many distinct
/// strings have been seen and another new one appears. The value is below
/// `u16::MAX`, so a real dictionary index can never equal `NULL_SENTINEL`.
const MAX_DICT_SIZE: u16 = 65000;
pub fn encode_strings(values: &[Option<String>]) -> (Vec<u8>, StringEncoding) {
if values.is_empty() {
return (Vec::new(), StringEncoding::Dictionary);
}
let mut dict: HashMap<String, u16> = HashMap::new();
let mut dict_entries: Vec<String> = Vec::new();
for s in values.iter().flatten() {
if !dict.contains_key(s) {
if dict.len() >= MAX_DICT_SIZE as usize {
return encode_strings_raw(values);
}
let idx = dict_entries.len() as u16;
dict.insert(s.clone(), idx);
dict_entries.push(s.clone());
}
}
let mut buf = Vec::new();
buf.extend_from_slice(&(dict_entries.len() as u32).to_le_bytes());
for entry in &dict_entries {
let bytes = entry.as_bytes();
buf.extend_from_slice(&(bytes.len() as u16).to_le_bytes());
buf.extend_from_slice(bytes);
}
for val in values {
match val {
None => buf.extend_from_slice(&NULL_SENTINEL.to_le_bytes()),
Some(s) => {
let idx = dict[s];
buf.extend_from_slice(&idx.to_le_bytes());
}
}
}
(buf, StringEncoding::Dictionary)
}
fn encode_strings_raw(values: &[Option<String>]) -> (Vec<u8>, StringEncoding) {
let mut buf = Vec::new();
for val in values {
match val {
None => buf.push(0),
Some(s) => {
buf.push(1);
let bytes = s.as_bytes();
buf.extend_from_slice(&(bytes.len() as u16).to_le_bytes());
buf.extend_from_slice(bytes);
}
}
}
(buf, StringEncoding::Raw)
}
/// Decodes `count` optional strings from `data` using the given `encoding`.
///
/// A `count` of zero returns an empty vector without looking at `data`;
/// otherwise the work is delegated to the decoder matching `encoding`.
pub fn decode_strings(data: &[u8], count: usize, encoding: StringEncoding) -> Vec<Option<String>> {
    match (count, encoding) {
        (0, _) => Vec::new(),
        (_, StringEncoding::Dictionary) => decode_strings_dictionary(data, count),
        (_, StringEncoding::Raw) => decode_strings_raw(data, count),
    }
}
/// Decodes `count` values from a dictionary-encoded buffer.
///
/// Expects a little-endian `u32` entry count, then that many entries
/// (`u16` byte length + bytes, decoded lossily as UTF-8), then `count`
/// little-endian `u16` indices where `NULL_SENTINEL` stands for `None`.
///
/// # Panics
///
/// Panics if `data` is shorter than the layout requires or if an index
/// points past the end of the dictionary.
fn decode_strings_dictionary(data: &[u8], count: usize) -> Vec<Option<String>> {
    let header: [u8; 4] = data[0..4].try_into().unwrap();
    let entry_count = u32::from_le_bytes(header) as usize;
    let mut pos = 4usize;

    let mut table: Vec<String> = Vec::with_capacity(entry_count);
    for _ in 0..entry_count {
        let len_bytes: [u8; 2] = data[pos..pos + 2].try_into().unwrap();
        let byte_len = usize::from(u16::from_le_bytes(len_bytes));
        pos += 2;
        let text = String::from_utf8_lossy(&data[pos..pos + byte_len]).into_owned();
        pos += byte_len;
        table.push(text);
    }

    let mut decoded = Vec::with_capacity(count);
    for _ in 0..count {
        let code_bytes: [u8; 2] = data[pos..pos + 2].try_into().unwrap();
        let code = u16::from_le_bytes(code_bytes);
        pos += 2;
        if code != NULL_SENTINEL {
            decoded.push(Some(table[usize::from(code)].clone()));
        } else {
            decoded.push(None);
        }
    }
    decoded
}
/// Decodes `count` values from a raw-encoded buffer.
///
/// Each value starts with a tag byte: `0` means `None`; any other value is
/// followed by a little-endian `u16` byte length and that many bytes, which
/// are decoded lossily as UTF-8.
///
/// # Panics
///
/// Panics if `data` ends before `count` values have been read.
fn decode_strings_raw(data: &[u8], count: usize) -> Vec<Option<String>> {
    let mut pos = 0usize;
    (0..count)
        .map(|_| {
            let tag = data[pos];
            pos += 1;
            if tag == 0 {
                return None;
            }
            let len_bytes: [u8; 2] = data[pos..pos + 2].try_into().unwrap();
            let byte_len = usize::from(u16::from_le_bytes(len_bytes));
            pos += 2;
            let text = String::from_utf8_lossy(&data[pos..pos + byte_len]).into_owned();
            pos += byte_len;
            Some(text)
        })
        .collect()
}
/// Identifies which layout a string column buffer uses, as returned by
/// `encode_strings` and consumed by `decode_strings`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum StringEncoding {
/// Distinct strings are stored once in a dictionary, followed by one
/// `u16` index per value.
Dictionary = 0,
/// Every value is stored inline: a presence byte, then for non-null
/// values a `u16` length and the string bytes.
Raw = 1,
}
/// Bit-packs a column of optional booleans.
///
/// Returns `(packed, null_bitmap)`. Both use one bit per value, least
/// significant bit first within each byte, and are `ceil(len / 8)` bytes
/// long. A set bit in `packed` means `Some(true)`; `Some(false)` and `None`
/// both leave it clear. A set bit in the null bitmap means `None`. The
/// bitmap is `None` when the input contains no nulls.
pub fn encode_bools(values: &[Option<bool>]) -> (Vec<u8>, Option<Vec<u8>>) {
    let byte_len = values.len().div_ceil(8);
    let mut bits = vec![0u8; byte_len];
    // Allocated on the first null only, so null-free input returns `None`.
    let mut nulls: Option<Vec<u8>> = None;

    for (pos, slot) in values.iter().enumerate() {
        let byte = pos / 8;
        let mask = 1u8 << (pos % 8);
        match slot {
            Some(true) => bits[byte] |= mask,
            Some(false) => {}
            None => nulls.get_or_insert_with(|| vec![0u8; byte_len])[byte] |= mask,
        }
    }

    (bits, nulls)
}
/// Unpacks `count` optional booleans from bit-packed buffers.
///
/// Bit `i` (least significant bit first within each byte) of `null_bitmap`,
/// when a bitmap is supplied, marks value `i` as `None`; otherwise bit `i`
/// of `packed` gives the boolean.
///
/// # Panics
///
/// Panics if a buffer that is consulted holds fewer than `count` bits.
pub fn decode_bools(packed: &[u8], null_bitmap: Option<&[u8]>, count: usize) -> Vec<Option<bool>> {
    let bit_at = |bytes: &[u8], pos: usize| -> bool { ((bytes[pos / 8] >> (pos % 8)) & 1) == 1 };

    (0..count)
        .map(|pos| {
            let missing = match null_bitmap {
                Some(bm) => bit_at(bm, pos),
                None => false,
            };
            if missing {
                None
            } else {
                Some(bit_at(packed, pos))
            }
        })
        .collect()
}
// Round-trip tests for the string and bool column encoders.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_string_dictionary_roundtrip() {
        let values = vec![
            Some("hello".to_string()),
            Some("world".to_string()),
            None,
            Some("hello".to_string()),
            Some("foo".to_string()),
            None,
        ];
        let (encoded, enc_type) = encode_strings(&values);
        assert_eq!(enc_type, StringEncoding::Dictionary);
        let decoded = decode_strings(&encoded, values.len(), enc_type);
        assert_eq!(decoded, values);
    }

    #[test]
    fn test_string_all_nulls() {
        let values: Vec<Option<String>> = vec![None, None, None];
        let (encoded, enc_type) = encode_strings(&values);
        let decoded = decode_strings(&encoded, values.len(), enc_type);
        assert_eq!(decoded, values);
    }

    #[test]
    fn test_string_empty() {
        let values: Vec<Option<String>> = vec![];
        let (encoded, enc_type) = encode_strings(&values);
        assert!(encoded.is_empty());
        let decoded = decode_strings(&encoded, 0, enc_type);
        assert!(decoded.is_empty());
    }

    #[test]
    fn test_string_single_value() {
        let values = vec![Some("test".to_string())];
        let (encoded, enc_type) = encode_strings(&values);
        let decoded = decode_strings(&encoded, 1, enc_type);
        assert_eq!(decoded, values);
    }

    #[test]
    fn test_string_high_cardinality_fallback() {
        let values: Vec<Option<String>> = (0..70000)
            .map(|i| Some(format!("unique_string_{}", i)))
            .collect();
        let (encoded, enc_type) = encode_strings(&values);
        assert_eq!(enc_type, StringEncoding::Raw);
        // The fallback output must also decode back to the input.
        let decoded = decode_strings(&encoded, values.len(), enc_type);
        assert_eq!(decoded, values);
    }

    #[test]
    fn test_string_dictionary_many_values_roundtrip() {
        // 900 distinct strings plus nulls: stays under MAX_DICT_SIZE, so the
        // dictionary form is used.
        let values: Vec<Option<String>> = (0..1000)
            .map(|i| if i % 10 == 0 { None } else { Some(format!("val_{}", i)) })
            .collect();
        let (encoded, enc_type) = encode_strings(&values);
        assert_eq!(enc_type, StringEncoding::Dictionary);
        let decoded = decode_strings(&encoded, values.len(), enc_type);
        assert_eq!(decoded, values);
    }

    #[test]
    fn test_string_raw_roundtrip() {
        let values: Vec<Option<String>> = (0..1000)
            .map(|i| if i % 10 == 0 { None } else { Some(format!("val_{}", i)) })
            .collect();
        // Call the raw encoder directly: with only 900 distinct strings,
        // `encode_strings` would choose the dictionary form and the raw
        // path would never be exercised.
        let (encoded, enc_type) = encode_strings_raw(&values);
        assert_eq!(enc_type, StringEncoding::Raw);
        let decoded = decode_strings(&encoded, values.len(), enc_type);
        assert_eq!(decoded, values);
    }

    #[test]
    fn test_bool_roundtrip_no_nulls() {
        let values: Vec<Option<bool>> = vec![Some(true), Some(false), Some(true), Some(true)];
        let (packed, null_bm) = encode_bools(&values);
        assert!(null_bm.is_none());
        let decoded = decode_bools(&packed, null_bm.as_deref(), values.len());
        assert_eq!(decoded, values);
    }

    #[test]
    fn test_bool_roundtrip_with_nulls() {
        let values: Vec<Option<bool>> = vec![Some(true), None, Some(false), None, Some(true)];
        let (packed, null_bm) = encode_bools(&values);
        assert!(null_bm.is_some());
        let decoded = decode_bools(&packed, null_bm.as_deref(), values.len());
        assert_eq!(decoded, values);
    }

    #[test]
    fn test_bool_all_false() {
        let values: Vec<Option<bool>> = vec![Some(false); 16];
        let (packed, null_bm) = encode_bools(&values);
        assert!(null_bm.is_none());
        assert!(packed.iter().all(|&b| b == 0));
        let decoded = decode_bools(&packed, null_bm.as_deref(), values.len());
        assert_eq!(decoded, values);
    }

    #[test]
    fn test_bool_all_true() {
        let values: Vec<Option<bool>> = vec![Some(true); 16];
        let (packed, null_bm) = encode_bools(&values);
        assert_eq!(packed.len(), 2);
        assert_eq!(packed[0], 0xFF);
        assert_eq!(packed[1], 0xFF);
        let decoded = decode_bools(&packed, null_bm.as_deref(), values.len());
        assert_eq!(decoded, values);
    }

    #[test]
    fn test_bool_large_count() {
        let values: Vec<Option<bool>> = (0..1000)
            .map(|i| if i % 7 == 0 { None } else { Some(i % 2 == 0) })
            .collect();
        let (packed, null_bm) = encode_bools(&values);
        let decoded = decode_bools(&packed, null_bm.as_deref(), values.len());
        assert_eq!(decoded, values);
    }
}