use std::collections::HashMap;
// Magic bytes at offset 0 identifying a KORE file (checked in KoreReader::open).
pub const KORE_MAGIC: &[u8; 4] = b"KORE";
// Format version byte expected at file offset 4.
pub const KORE_V2: u8 = 2;
// Size of the fixed file header; the compressed schema block starts here.
const HEADER_SIZE: usize = 64;
// Default rows per chunk (not referenced in this portion of the file —
// presumably used by the writer side).
const DEFAULT_CHUNK_SIZE: usize = 65536;
/// Logical value type of a column, stored on disk as a single tag byte.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum KType { Int = 1, Float = 2, Bool = 3, Str = 4, Bytes = 5, Struct = 6, List = 7, Map = 8 }

impl KType {
    /// Decode a type tag byte; unrecognized tags degrade to `Str`.
    fn from_u8(v: u8) -> Self {
        match v {
            1 => Self::Int,
            2 => Self::Float,
            3 => Self::Bool,
            5 => Self::Bytes,
            6 => Self::Struct,
            7 => Self::List,
            8 => Self::Map,
            // Tag 4 is Str; any unknown tag also falls back to Str.
            _ => Self::Str,
        }
    }
}
/// Per-column-block encoding scheme, stored as one tag byte in chunk metadata.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum Codec {
    Raw = 0,
    RLE = 1,
    Delta = 2,
    DictRLE = 3,
    Bitpack = 4,
    BDict = 5,
    CDelta = 6,
    FOR = 7,
    HuffDict = 8,
    Derived = 9,
}

impl Codec {
    /// Decode a codec tag byte; unrecognized tags degrade to `Raw`.
    fn from_u8(v: u8) -> Self {
        match v {
            1 => Self::RLE,
            2 => Self::Delta,
            3 => Self::DictRLE,
            4 => Self::Bitpack,
            5 => Self::BDict,
            6 => Self::CDelta,
            7 => Self::FOR,
            8 => Self::HuffDict,
            9 => Self::Derived,
            // Tag 0 is Raw; any unknown tag also falls back to Raw.
            _ => Self::Raw,
        }
    }
}
/// A single decoded cell value. `Null` marks missing or undecodable data.
#[derive(Debug, Clone)]
pub enum KVal {
    Int(i64),
    Float(f64),
    Bool(bool),
    Str(String),
    Bytes(Vec<u8>),
    Null,
    Struct(Vec<(String, KVal)>),
    List(Vec<KVal>),
    Map(Vec<(KVal, KVal)>),
}

impl KVal {
    /// Coerce to i64: floats truncate, bools map to 0/1, everything else is 0.
    #[inline]
    pub fn as_i64(&self) -> i64 {
        match self {
            KVal::Int(v) => *v,
            KVal::Float(v) => *v as i64,
            KVal::Bool(v) => i64::from(*v),
            _ => 0,
        }
    }

    /// Coerce to f64: ints widen; every non-numeric variant is 0.0.
    #[inline]
    pub fn as_f64(&self) -> f64 {
        match self {
            KVal::Float(v) => *v,
            KVal::Int(v) => *v as f64,
            _ => 0.0,
        }
    }

    /// Borrow the string payload, or "" for non-string variants.
    #[inline]
    pub fn as_str(&self) -> &str {
        if let KVal::Str(s) = self { s } else { "" }
    }

    /// True only for the `Null` variant.
    #[inline]
    pub fn is_null(&self) -> bool {
        matches!(self, KVal::Null)
    }

    /// Human-readable rendering. Floats print with up to 8 decimals,
    /// then trailing zeros (and a bare trailing '.') are stripped.
    pub fn display(&self) -> String {
        match self {
            KVal::Int(v) => v.to_string(),
            KVal::Float(v) => {
                let fixed = format!("{:.8}", v);
                fixed.trim_end_matches('0').trim_end_matches('.').to_string()
            }
            KVal::Bool(v) => v.to_string(),
            KVal::Str(v) => v.clone(),
            KVal::Bytes(v) => format!("<{} bytes>", v.len()),
            KVal::Null => "null".to_string(),
            KVal::Struct(fields) => {
                let rendered: Vec<String> = fields
                    .iter()
                    .map(|(name, val)| format!("{}:{}", name, val.display()))
                    .collect();
                format!("{{{}}}", rendered.join(", "))
            }
            KVal::List(items) => {
                let rendered: Vec<String> = items.iter().map(KVal::display).collect();
                format!("[{}]", rendered.join(", "))
            }
            KVal::Map(pairs) => {
                let rendered: Vec<String> = pairs
                    .iter()
                    .map(|(k, v)| format!("{}=>{}", k.display(), v.display()))
                    .collect();
                format!("{{{}}}", rendered.join(", "))
            }
        }
    }
}
/// Column descriptor: name, logical type, and optional AES-256 key.
#[derive(Debug, Clone)]
pub struct KColumn {
    pub name: String,
    pub ktype: KType,
    pub encrypted: bool,
    // All-zero key is the "no key" sentinel; aes256_ctr passes data through.
    pub enc_key: [u8; 32],
}

impl KColumn {
    /// Plain (unencrypted) column descriptor.
    pub fn new(name: &str, ktype: KType) -> Self {
        Self {
            name: name.to_string(),
            ktype,
            encrypted: false,
            enc_key: [0u8; 32],
        }
    }

    /// Descriptor for a column whose payloads are decrypted with `key`.
    pub fn encrypted(name: &str, ktype: KType, key: [u8; 32]) -> Self {
        Self {
            name: name.to_string(),
            ktype,
            encrypted: true,
            enc_key: key,
        }
    }
}
// Per-column statistics record. Not referenced elsewhere in this portion
// of the file; KoreReader::open parses the footer's stats fields but
// discards the values rather than populating this struct.
#[derive(Debug, Clone, Default)]
pub struct ColStats {
pub null_count: u32,
pub min_i64: i64,
pub max_i64: i64,
pub min_str: String,
pub max_str: String,
}
// Location and encoding of one column's block within one chunk.
#[derive(Clone, Debug)]
pub struct ColMeta {
// Absolute byte offset of the block (checksum + length prefix) in the file.
pub file_offset: u64,
// Compressed payload length recorded in the footer (the block also
// carries its own length prefix — see decode_col_block).
pub comp_len: u32,
// Codec tag byte, decoded via Codec::from_u8.
pub codec: u8,
}
// In-memory handle to a fully loaded KORE file. The entire file is held
// in `file_data`; column blocks are decoded lazily from it on demand.
pub struct KoreReader {
pub columns: Vec<KColumn>,
pub nrows: usize,
pub ncols: usize,
pub nchunks: usize,
// Row count of each chunk, indexed by chunk number.
pub chunk_nrows: Vec<usize>,
// Block metadata, indexed as col_meta[chunk_idx][col_idx].
pub col_meta: Vec<Vec<ColMeta>>,
// Raw bytes of the whole file.
pub file_data: Vec<u8>,
// Shared string dictionary (passed through to decode_column_data).
pub dict: Vec<String>,
}
impl KoreReader {
/// Decode every column across all chunks into memory, in schema order.
/// Columns that come up short (e.g. after a decode failure) are padded
/// to `nrows` with `KVal::Null`.
pub fn read_all_columns(&self) -> Vec<Vec<KVal>> {
let mut cols: Vec<Vec<KVal>> = vec![Vec::with_capacity(self.nrows); self.ncols];
for chunk_idx in 0..self.nchunks {
let cnr = self.chunk_nrows[chunk_idx];
for (ci, col_vals) in cols.iter_mut().enumerate().take(self.ncols) {
let meta = &self.col_meta[chunk_idx][ci];
let vals = self.decode_col_block(ci, meta, cnr, chunk_idx);
col_vals.extend_from_slice(&vals);
}
}
// Defensive: guarantee every column has exactly nrows entries.
for c in &mut cols { if c.len() < self.nrows { c.resize(self.nrows, KVal::Null); } }
cols
}
/// Decode one column block of one chunk. On-disk layout at
/// `meta.file_offset`: [u32 crc32][u32 comp_len][comp_len bytes].
/// Every structural or checksum failure degrades to a column of
/// `KVal::Null` instead of an error.
pub fn decode_col_block(&self, ci: usize, meta: &ColMeta, nrows: usize, chunk_idx: usize) -> Vec<KVal> {
let off = meta.file_offset as usize;
if off + 8 > self.file_data.len() { return vec![KVal::Null; nrows]; }
let checksum = u32::from_le_bytes(self.file_data[off..off+4].try_into().unwrap_or([0;4]));
let comp_len = u32::from_le_bytes(self.file_data[off+4..off+8].try_into().unwrap_or([0;4]));
let comp_start = off + 8;
// Clamp the end to the file so a corrupt length cannot slice OOB.
let comp_end = comp_start.saturating_add(comp_len as usize).min(self.file_data.len());
if comp_start >= comp_end { return vec![KVal::Null; nrows]; }
let comp_slice = &self.file_data[comp_start..comp_end];
// Integrity check over the compressed bytes.
if crc32(comp_slice) != checksum { return vec![KVal::Null; nrows]; }
let decompressed = decompress_block(comp_slice);
let mut codec_data = decompressed;
// NOTE(review): decryption runs AFTER decompression, which implies the
// writer encrypts the decompressed codec payload — confirm against the
// writer. derive_nonce/aes256 keys: see KColumn.
if ci < self.columns.len() && self.columns[ci].encrypted {
let col = &self.columns[ci];
let nonce = derive_nonce(&col.name, chunk_idx);
codec_data = aes256_ctr(&codec_data, &col.enc_key, &nonce);
}
let codec = Codec::from_u8(meta.codec);
// NOTE(review): self.columns[ci] is unguarded here; a footer claiming
// more columns than the schema (ci >= columns.len()) would panic.
decode_column_data(&codec_data, &self.columns[ci], codec, nrows, &self.dict)
}
/// Load a KORE v2 file fully into memory and parse its header, schema,
/// string dictionary, and footer metadata. Returns a descriptive error
/// string for any structural problem.
pub fn open(path: &str) -> Result<Self, String> {
let data = std::fs::read(path).map_err(|e| format!("Cannot read {}: {}", path, e))?;
if data.len() < 12 { return Err("File too small".to_string()); }
let len = data.len();
// Trailer (last 12 bytes): u32 footer compressed length + u64 footer offset.
let footer_comp_len = u32::from_le_bytes(data[len-12..len-8].try_into().unwrap_or([0;4])) as usize;
let footer_offset = u64::from_le_bytes(data[len-8..len].try_into().unwrap_or([0;8])) as usize;
// Fixed 64-byte header: magic @0, version @4, u16 ncols @6, u64 nrows @8.
if data.len() < 64 || &data[0..4] != KORE_MAGIC { return Err("Not a KORE file".to_string()); }
if data[4] != KORE_V2 { return Err("Unsupported KORE version".to_string()); }
let ncols = u16::from_le_bytes(data[6..8].try_into().unwrap_or([0;2])) as usize;
let nrows = u64::from_le_bytes(data[8..16].try_into().unwrap_or([0;8])) as usize;
// Compressed schema block follows the fixed header.
let mut p = HEADER_SIZE;
if p + 4 > data.len() { return Err("Truncated schema header".to_string()); }
let schema_comp_len = u32::from_le_bytes(data[p..p+4].try_into().unwrap_or([0;4])) as usize; p += 4;
let schema_comp_end = (p + schema_comp_len).min(data.len());
let schema_comp = &data[p..schema_comp_end]; p = schema_comp_end;
let schema_raw = decompress_block(schema_comp);
// Per-column record: varint name length, name bytes, type tag byte,
// encrypted flag byte.
let mut cols: Vec<KColumn> = Vec::with_capacity(ncols);
let mut sp = 0usize;
while sp < schema_raw.len() {
let (namelen, np) = read_varint(&schema_raw, sp); sp = np;
let end = sp + namelen as usize;
let name = String::from_utf8_lossy(&schema_raw[sp..end.min(schema_raw.len())]).into_owned();
sp = end;
let ktype = KType::from_u8(schema_raw.get(sp).copied().unwrap_or(4)); sp += 1;
let enc = schema_raw.get(sp).copied().unwrap_or(0); sp += 1;
// NOTE(review): encrypted columns get an all-zero key here, which
// aes256_ctr treats as a passthrough — real keys must be injected by
// the caller before encrypted data can be decoded.
if enc != 0 { cols.push(KColumn::encrypted(&name, ktype, [0u8;32])); } else { cols.push(KColumn::new(&name, ktype)); }
}
// Compressed dictionary block: varint entry count, then
// varint-length-prefixed strings.
if p + 4 > data.len() { return Err("Truncated dict header".to_string()); }
let dict_comp_len = u32::from_le_bytes(data[p..p+4].try_into().unwrap_or([0;4])) as usize; p += 4;
let dict_comp_end = (p + dict_comp_len).min(data.len());
let dict_comp = &data[p..dict_comp_end];
let dict_raw = decompress_block(dict_comp);
let mut dict: Vec<String> = Vec::new();
if !dict_raw.is_empty() {
let mut dp = 0usize;
let (nentries, ndp) = read_varint(&dict_raw, dp); dp = ndp;
for _ in 0..nentries {
let (elen, epp) = read_varint(&dict_raw, dp); dp = epp;
let eend = dp + elen as usize;
dict.push(String::from_utf8_lossy(&dict_raw[dp..eend.min(dict_raw.len())]).into_owned());
dp = eend;
}
}
// NOTE(review): this addition can overflow on a corrupt trailer (release
// builds wrap); checked_add would be safer.
if footer_offset + footer_comp_len > data.len() { return Err("Invalid footer offset".to_string()); }
let footer_comp = &data[footer_offset..footer_offset + footer_comp_len];
let footer_raw = decompress_block(footer_comp);
// Footer: u32 chunk count, u16 column count, u32 row count per chunk,
// then per-chunk-per-column metadata records.
let mut fp = 0usize;
if fp + 6 > footer_raw.len() { return Err("Truncated footer".to_string()); }
let fnchunks = u32::from_le_bytes(footer_raw[fp..fp+4].try_into().unwrap_or([0;4])) as usize; fp += 4;
let fncols = u16::from_le_bytes(footer_raw[fp..fp+2].try_into().unwrap_or([0;2])) as usize; fp += 2;
let mut chunk_nrows: Vec<usize> = Vec::with_capacity(fnchunks);
for _ in 0..fnchunks {
if fp + 4 > footer_raw.len() { return Err("Truncated footer chunk rows".to_string()); }
let cr = u32::from_le_bytes(footer_raw[fp..fp+4].try_into().unwrap_or([0;4])) as usize; fp += 4;
chunk_nrows.push(cr);
}
let mut col_meta: Vec<Vec<ColMeta>> = Vec::with_capacity(fnchunks);
for _ in 0..fnchunks {
let mut cms: Vec<ColMeta> = Vec::with_capacity(fncols);
for _ in 0..fncols {
// Record: u64 block offset, u32 compressed length, u8 codec tag.
if fp + 8 + 4 + 1 > footer_raw.len() { return Err("Truncated footer meta".to_string()); }
let file_offset = u64::from_le_bytes(footer_raw[fp..fp+8].try_into().unwrap_or([0;8])); fp += 8;
let comp_len = u32::from_le_bytes(footer_raw[fp..fp+4].try_into().unwrap_or([0;4])); fp += 4;
let codec = footer_raw[fp]; fp += 1;
// Stats payload is parsed only to advance fp; values are discarded.
// Layout: u32 (presumably null_count — confirm with writer), two
// zigzag varints (min/max), two varint-length-prefixed strings, then
// a fixed 512-byte region (presumably reserved/histogram — confirm).
if fp + 4 > footer_raw.len() { return Err("Truncated footer stats".to_string()); }
fp += 4; let (_minv, p1) = read_zvar(&footer_raw, fp); fp = p1;
let (_maxv, p2) = read_zvar(&footer_raw, fp); fp = p2;
let (minlen, p3) = read_varint(&footer_raw, fp); fp = p3;
fp += minlen as usize;
let (maxlen, p4) = read_varint(&footer_raw, fp); fp = p4;
fp += maxlen as usize;
fp += 512;
cms.push(ColMeta { file_offset, comp_len, codec });
}
col_meta.push(cms);
}
Ok(KoreReader {
columns: cols,
nrows,
ncols,
nchunks: fnchunks,
chunk_nrows,
col_meta,
file_data: data,
dict,
})
}
}
// AES S-box (the FIPS-197 SubBytes substitution table).
#[rustfmt::skip]
const AES_SBOX: [u8; 256] = [
0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76,
0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0,
0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15,
0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75,
0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84,
0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf,
0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8,
0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2,
0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73,
0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb,
0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79,
0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08,
0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a,
0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e,
0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf,
0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16,
];
// AES key-schedule round constants Rcon[1..=10].
const AES_RCON: [u8; 10] = [0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36];
/// FIPS-197 AES-256 key schedule: expand a 32-byte key into 60 round-key
/// words (initial AddRoundKey + 14 rounds, 4 words each).
fn aes256_key_expand(key: &[u8; 32]) -> [u32; 60] {
let mut rk = [0u32; 60];
// The first 8 words are the key itself, read as big-endian u32s.
for i in 0..8 {
rk[i] = u32::from_be_bytes([key[4*i], key[4*i+1], key[4*i+2], key[4*i+3]]);
}
for i in 8..60 {
let mut t = rk[i - 1];
if i % 8 == 0 {
// Every 8th word: RotWord, SubWord, then XOR the round constant
// into the top byte.
t = t.rotate_left(8);
let b = t.to_be_bytes();
t = u32::from_be_bytes([
AES_SBOX[b[0] as usize], AES_SBOX[b[1] as usize],
AES_SBOX[b[2] as usize], AES_SBOX[b[3] as usize],
]) ^ ((AES_RCON[i / 8 - 1] as u32) << 24);
} else if i % 8 == 4 {
// AES-256 extra step: SubWord (no rotation) at the mid-point word.
let b = t.to_be_bytes();
t = u32::from_be_bytes([
AES_SBOX[b[0] as usize], AES_SBOX[b[1] as usize],
AES_SBOX[b[2] as usize], AES_SBOX[b[3] as usize],
]);
}
rk[i] = rk[i - 8] ^ t;
}
rk
}
/// Multiply by 2 in GF(2^8) with the AES reduction polynomial (0x11b).
#[inline]
fn gf_mul2(x: u8) -> u8 {
    let doubled = x << 1;
    if x >= 0x80 { doubled ^ 0x1b } else { doubled }
}

/// Multiply by 3 in GF(2^8): 3·x = 2·x XOR x.
#[inline]
fn gf_mul3(x: u8) -> u8 {
    gf_mul2(x) ^ x
}
/// Encrypt one 16-byte block with AES-256 (14 rounds) using round keys
/// from `aes256_key_expand`. Pure-software implementation; the S-box
/// lookups are data-dependent, so it is not constant-time.
fn aes256_encrypt_block(block: &[u8; 16], rk: &[u32; 60]) -> [u8; 16] {
let mut s = [0u8; 16];
// Initial AddRoundKey.
for i in 0..4 {
let k = rk[i].to_be_bytes();
s[4*i] = block[4*i] ^ k[0];
s[4*i+1] = block[4*i+1] ^ k[1];
s[4*i+2] = block[4*i+2] ^ k[2];
s[4*i+3] = block[4*i+3] ^ k[3];
}
// Rounds 1..=13: SubBytes, ShiftRows, MixColumns, AddRoundKey.
for round in 1..14 {
let mut t = [0u8; 16];
for i in 0..16 { t[i] = AES_SBOX[s[i] as usize]; }
// ShiftRows expressed as a fixed permutation of the state bytes.
let sr = [
t[0],t[5],t[10],t[15], t[4],t[9],t[14],t[3],
t[8],t[13],t[2],t[7], t[12],t[1],t[6],t[11],
];
// MixColumns: multiply each column by the fixed matrix in GF(2^8).
for c in 0..4 {
let i = c * 4;
let (a0,a1,a2,a3) = (sr[i],sr[i+1],sr[i+2],sr[i+3]);
s[i] = gf_mul2(a0) ^ gf_mul3(a1) ^ a2 ^ a3;
s[i+1] = a0 ^ gf_mul2(a1) ^ gf_mul3(a2) ^ a3;
s[i+2] = a0 ^ a1 ^ gf_mul2(a2) ^ gf_mul3(a3);
s[i+3] = gf_mul3(a0) ^ a1 ^ a2 ^ gf_mul2(a3);
}
for i in 0..4 {
let k = rk[round * 4 + i].to_be_bytes();
s[4*i] ^= k[0]; s[4*i+1] ^= k[1]; s[4*i+2] ^= k[2]; s[4*i+3] ^= k[3];
}
}
// Final round (14): SubBytes + ShiftRows + AddRoundKey, no MixColumns.
let mut t = [0u8; 16];
for i in 0..16 { t[i] = AES_SBOX[s[i] as usize]; }
let sr = [
t[0],t[5],t[10],t[15], t[4],t[9],t[14],t[3],
t[8],t[13],t[2],t[7], t[12],t[1],t[6],t[11],
];
let mut out = [0u8; 16];
for i in 0..4 {
let k = rk[56 + i].to_be_bytes();
out[4*i] = sr[4*i] ^ k[0];
out[4*i+1] = sr[4*i+1] ^ k[1];
out[4*i+2] = sr[4*i+2] ^ k[2];
out[4*i+3] = sr[4*i+3] ^ k[3];
}
out
}
/// AES-256-CTR keystream XOR (encryption and decryption are identical).
/// Counter block layout: 12-byte nonce followed by a big-endian u32
/// block counter starting at 0. An all-zero key is the "no encryption"
/// sentinel and returns the input unchanged.
pub fn aes256_ctr(data: &[u8], key: &[u8; 32], nonce: &[u8; 12]) -> Vec<u8> {
    if key.iter().all(|&b| b == 0) {
        return data.to_vec();
    }
    let round_keys = aes256_key_expand(key);
    let mut out = Vec::with_capacity(data.len());
    for (block_idx, chunk) in data.chunks(16).enumerate() {
        let mut ctr_block = [0u8; 16];
        ctr_block[..12].copy_from_slice(nonce);
        ctr_block[12..].copy_from_slice(&(block_idx as u32).to_be_bytes());
        let keystream = aes256_encrypt_block(&ctr_block, &round_keys);
        out.extend(chunk.iter().zip(keystream.iter()).map(|(&d, &k)| d ^ k));
    }
    out
}
/// Project the file's columns onto `target_schema` by column name.
/// Columns missing from the source come back as all-`Null`; source
/// columns absent from the target are dropped. The `KType` declared in
/// the target schema is not validated against the source type.
pub fn evolve_schema_read(
    reader: &KoreReader,
    target_schema: &[(String, KType)],
) -> Vec<Vec<KVal>> {
    let cols = reader.read_all_columns();
    // Name -> source column index.
    let index_by_name: HashMap<&str, usize> = reader
        .columns
        .iter()
        .enumerate()
        .map(|(idx, col)| (col.name.as_str(), idx))
        .collect();
    target_schema
        .iter()
        .map(|(name, _ktype)| match index_by_name.get(name.as_str()) {
            Some(&idx) if idx < cols.len() => cols[idx].clone(),
            _ => vec![KVal::Null; reader.nrows],
        })
        .collect()
}
impl KoreReader {
/// Decode a single row by global index. Note: this decodes every column
/// of the owning chunk, so random access costs a full chunk decode per
/// call.
pub fn read_row(&self, row_idx: usize) -> Option<Vec<KVal>> {
if row_idx >= self.nrows { return None; }
let mut offset = 0;
// Walk chunks, tracking the global row offset where each chunk starts.
for chunk_idx in 0..self.nchunks {
let cnr = self.chunk_nrows[chunk_idx];
if row_idx < offset + cnr {
let local_row = row_idx - offset;
let row: Vec<KVal> = (0..self.ncols).map(|ci| {
let meta = &self.col_meta[chunk_idx][ci];
let vals = self.decode_col_block(ci, meta, cnr, chunk_idx);
// A short column yields Null instead of panicking.
vals.into_iter().nth(local_row).unwrap_or(KVal::Null)
}).collect();
return Some(row);
}
offset += cnr;
}
None
}
/// Decode rows [start, end) — end clamped to nrows — as row vectors,
/// decoding only the chunks that overlap the requested range.
pub fn read_row_range(&self, start: usize, end: usize) -> Vec<Vec<KVal>> {
let end = end.min(self.nrows);
if start >= end { return Vec::new(); }
let mut rows = Vec::with_capacity(end - start);
let mut offset = 0;
for chunk_idx in 0..self.nchunks {
let cnr = self.chunk_nrows[chunk_idx];
let chunk_start = offset;
let chunk_end = offset + cnr;
offset += cnr;
// Skip chunks wholly before the range; stop once past it.
if chunk_end <= start { continue; }
if chunk_start >= end { break; }
// Decode each column of this chunk once, then slice rows out of it.
let chunk_cols: Vec<Vec<KVal>> = (0..self.ncols).map(|ci| {
let meta = &self.col_meta[chunk_idx][ci];
self.decode_col_block(ci, meta, cnr, chunk_idx)
}).collect();
// Translate the global range into chunk-local row indices.
let local_start = start.saturating_sub(chunk_start);
let local_end = if end < chunk_end { end - chunk_start } else { cnr };
for ri in local_start..local_end {
let row: Vec<KVal> = chunk_cols.iter()
.map(|c| c.get(ri).cloned().unwrap_or(KVal::Null))
.collect();
rows.push(row);
}
}
rows
}
}
/// Row-level tombstone set backed by a packed u64 bitmap (bit set = deleted).
pub struct DeleteBitmap {
    bits: Vec<u64>,
    total_rows: usize,
    deleted_count: usize,
}

impl DeleteBitmap {
    /// Fresh bitmap for `total_rows` rows with nothing deleted.
    pub fn new(total_rows: usize) -> Self {
        Self {
            bits: vec![0u64; total_rows.div_ceil(64)],
            total_rows,
            deleted_count: 0,
        }
    }

    /// Mark row `idx` as deleted. Out-of-range indices and repeated
    /// deletions are no-ops, keeping `deleted_count` exact.
    pub fn delete_row(&mut self, idx: usize) {
        if idx >= self.total_rows {
            return;
        }
        let word = idx / 64;
        let mask = 1u64 << (idx % 64);
        if self.bits[word] & mask == 0 {
            self.bits[word] |= mask;
            self.deleted_count += 1;
        }
    }

    /// Whether row `idx` is deleted; any index past the end counts as deleted.
    pub fn is_deleted(&self, idx: usize) -> bool {
        if idx >= self.total_rows {
            return true;
        }
        self.bits[idx / 64] & (1u64 << (idx % 64)) != 0
    }

    /// Number of rows still live.
    pub fn active_count(&self) -> usize {
        self.total_rows - self.deleted_count
    }

    /// Persist to `<path>.del`: little-endian u64 total_rows, u64
    /// deleted_count, then the raw bitmap words.
    pub fn save(&self, path: &str) -> Result<(), String> {
        use std::io::Write;
        let del_path = format!("{}.del", path);
        let mut f = std::fs::File::create(&del_path)
            .map_err(|e| format!("Cannot create {}: {}", del_path, e))?;
        f.write_all(&(self.total_rows as u64).to_le_bytes())
            .map_err(|e| e.to_string())?;
        f.write_all(&(self.deleted_count as u64).to_le_bytes())
            .map_err(|e| e.to_string())?;
        for &word in &self.bits {
            f.write_all(&word.to_le_bytes()).map_err(|e| e.to_string())?;
        }
        Ok(())
    }

    /// Load from `<path>.del`. Bitmap words missing from a short file are
    /// left zeroed (treated as live) rather than reported as an error.
    pub fn load(path: &str) -> Result<Self, String> {
        let del_path = format!("{}.del", path);
        let data = std::fs::read(&del_path)
            .map_err(|e| format!("Cannot read {}: {}", del_path, e))?;
        if data.len() < 16 {
            return Err("Delete bitmap too short".to_string());
        }
        let total_rows = u64::from_le_bytes(data[0..8].try_into().unwrap()) as usize;
        let deleted_count = u64::from_le_bytes(data[8..16].try_into().unwrap()) as usize;
        let mut bits = vec![0u64; total_rows.div_ceil(64)];
        for (i, word) in bits.iter_mut().enumerate() {
            let off = 16 + i * 8;
            if let Some(raw) = data.get(off..off + 8) {
                *word = u64::from_le_bytes(raw.try_into().unwrap());
            }
        }
        Ok(DeleteBitmap { bits, total_rows, deleted_count })
    }
}
/// Prefix-sum a delta stream starting from `base`, manually unrolled in
/// groups of four to encourage vectorization.
#[inline]
#[allow(dead_code)]
fn delta_decode_simd_hint(deltas: &[i64], base: i64) -> Vec<i64> {
    let mut out = Vec::with_capacity(deltas.len());
    let mut acc = base;
    let mut quads = deltas.chunks_exact(4);
    for quad in &mut quads {
        acc += quad[0]; out.push(acc);
        acc += quad[1]; out.push(acc);
        acc += quad[2]; out.push(acc);
        acc += quad[3]; out.push(acc);
    }
    // Tail of fewer than four deltas.
    for &d in quads.remainder() {
        acc += d;
        out.push(acc);
    }
    out
}
#[inline]
#[allow(dead_code)]
// Presumably a hook for a hardware/SIMD CRC path; currently just
// forwards to the table-driven software implementation.
fn crc32_simd_hint(data: &[u8]) -> u32 {
crc32(data)
}
/// CRC-32 (IEEE 802.3, reflected polynomial 0xEDB88320), one byte at a
/// time through a compile-time table.
fn crc32(data: &[u8]) -> u32 {
    const fn build_table() -> [u32; 256] {
        let mut table = [0u32; 256];
        let mut n = 0usize;
        while n < 256 {
            let mut c = n as u32;
            let mut bit = 0;
            while bit < 8 {
                c = if c & 1 == 1 { 0xEDB8_8320 ^ (c >> 1) } else { c >> 1 };
                bit += 1;
            }
            table[n] = c;
            n += 1;
        }
        table
    }
    const TABLE: [u32; 256] = build_table();
    let folded = data.iter().fold(0xFFFF_FFFFu32, |crc, &byte| {
        TABLE[((crc ^ byte as u32) & 0xFF) as usize] ^ (crc >> 8)
    });
    folded ^ 0xFFFF_FFFF
}
/// ZigZag-encode: interleave sign into the low bit (0,-1,1,-2 → 0,1,2,3).
#[inline]
fn zigzag_enc(n: i64) -> u64 {
    ((n >> 63) ^ (n << 1)) as u64
}

/// ZigZag-decode, the exact inverse of `zigzag_enc`.
#[inline]
fn zigzag_dec(n: u64) -> i64 {
    let half = (n >> 1) as i64;
    if n & 1 == 0 { half } else { !half }
}
/// Append `v` as an LEB128 varint: 7 data bits per byte, MSB set on all
/// bytes except the last.
fn write_varint(buf: &mut Vec<u8>, mut v: u64) {
    while v >= 0x80 {
        buf.push((v as u8) | 0x80);
        v >>= 7;
    }
    buf.push(v as u8);
}
// ZigZag-encode a signed value, then append it as an LEB128 varint.
fn write_zvar(buf: &mut Vec<u8>, n: i64) { write_varint(buf, zigzag_enc(n)) }
/// Read an LEB128 varint at `pos`, returning (value, next position).
/// Stops at end of input (possibly mid-value) and caps the shift at 64
/// bits, so at most 10 bytes are ever consumed. A `pos` past the end
/// returns (0, pos).
fn read_varint(data: &[u8], pos: usize) -> (u64, usize) {
    let mut value = 0u64;
    let mut shift = 0u32;
    let mut idx = pos;
    for &byte in data.get(pos..).unwrap_or(&[]) {
        value |= ((byte & 0x7F) as u64) << shift;
        idx += 1;
        if byte & 0x80 == 0 {
            return (value, idx);
        }
        shift += 7;
        if shift >= 64 {
            return (value, idx);
        }
    }
    (value, idx)
}
// Read an LEB128 varint and ZigZag-decode it back to a signed value.
fn read_zvar(data: &[u8], pos: usize) -> (i64, usize) {
let (v, p) = read_varint(data, pos); (zigzag_dec(v), p)
}
// LZ77 tuning: 64 KiB window and maximum match length (both fit a u16
// token field), 6-byte minimum match, 16-bit hash table over 4-byte
// prefixes, and a hash-chain probe depth of 8.
const LZ_WIN: usize = 65535;
const LZ_MIN: usize = 6;
const LZ_MAX: usize = 65535;
const LZ_HASH_BITS: usize = 16;
const LZ_HASH_SIZE: usize = 1 << LZ_HASH_BITS;
const LZ_HASH_MASK: usize = LZ_HASH_SIZE - 1;
const LZ_CHAIN_DEPTH: usize = 8;

/// Hash the four bytes at `pos` into a hash-table index (Fibonacci
/// multiplicative hashing on the little-endian 32-bit word).
#[inline]
fn lz_hash4(data: &[u8], pos: usize) -> usize {
    let word = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap());
    (word.wrapping_mul(0x9E3779B1) >> (32 - LZ_HASH_BITS)) as usize & LZ_HASH_MASK
}
/// LZ77 compressor. Token stream: plain bytes are literals; 0xFF escapes
/// a 5-byte match token [0xFF][u16 dist LE][u16 len LE]. A literal 0xFF
/// byte is emitted as the token [0xFF][dist=0][len=1]. Matches come from
/// a 4-byte hash table with chained candidates, plus one-step lazy
/// matching.
fn lz77_compress(input: &[u8]) -> Vec<u8> {
// Inputs too small to ever match: emit literals only (still escaping 0xFF).
if input.len() < LZ_MIN + 4 {
let mut out = Vec::with_capacity(input.len() + 4);
for &b in input {
if b == 0xFF {
out.push(0xFF);
out.push(0); out.push(0);
out.push(1); out.push(0);
} else {
out.push(b);
}
}
return out;
}
let mut out = Vec::with_capacity(input.len());
// htab[h] stores position+1 (0 = empty); chain[p] links to the previous
// position+1 with the same hash.
let mut htab = vec![0u32; LZ_HASH_SIZE]; let mut chain = vec![0u32; input.len()]; let mut pos = 0usize;
// Last position where a full 4-byte hash can be computed.
let limit = input.len().saturating_sub(4);
let mut lit_start: usize = 0;
let mut in_literals = false;
// Copy a pending literal run into the output, escaping 0xFF bytes.
#[inline(always)]
fn flush_literals(out: &mut Vec<u8>, input: &[u8], start: usize, end: usize) {
for &b in &input[start..end] {
if b == 0xFF {
out.push(0xFF);
out.push(0); out.push(0); out.push(1); out.push(0); } else {
out.push(b);
}
}
}
// Walk the hash chain at `pos` and return the best (dist, len) match,
// or (0, 0) when nothing reaches LZ_MIN.
#[inline(always)]
fn find_best_match(input: &[u8], pos: usize, htab: &[u32], chain: &[u32]) -> (usize, usize) {
if pos + 3 >= input.len() { return (0, 0); }
let h = lz_hash4(input, pos);
let mut candidate = htab[h] as usize;
let mut best_len = 0usize;
let mut best_dist = 0usize;
let mut depth = 0;
while candidate > 0 && depth < LZ_CHAIN_DEPTH {
let start = candidate - 1;
let dist = pos - start;
// Chains are ordered newest-first, so once out of window, stop.
if dist > LZ_WIN { break; }
// Quick 4-byte prefix check before extending.
if dist > 0
&& input[start] == input[pos]
&& input[start+1] == input[pos+1]
&& input[start+2] == input[pos+2]
&& input[start+3] == input[pos+3]
{
let mut len = 4;
let max_possible = LZ_MAX.min(input.len() - pos).min(input.len() - start);
while len < max_possible && input[start + len] == input[pos + len] {
len += 1;
}
if len > best_len {
best_len = len;
best_dist = dist;
// Good enough — cut the chain walk short.
if len >= 128 { break; } }
}
candidate = chain[start] as usize;
depth += 1;
}
if best_len >= LZ_MIN { (best_dist, best_len) } else { (0, 0) }
}
while pos < input.len() {
// Tail bytes (no room for a 4-byte hash) are always literals.
if pos >= limit {
if !in_literals { lit_start = pos; in_literals = true; }
pos += 1;
continue;
}
// Register the current position in the hash table/chain.
let h = lz_hash4(input, pos);
chain[pos] = htab[h];
htab[h] = (pos + 1) as u32;
let (dist, len) = find_best_match(input, pos, &htab, &chain);
if len >= LZ_MIN {
// Lazy matching: if the next position yields a sufficiently longer
// match, emit this byte as a literal instead.
if pos + 1 < limit && len < 128 {
let h2 = lz_hash4(input, pos + 1);
chain[pos + 1] = htab[h2];
htab[h2] = (pos + 2) as u32;
let (_, len2) = find_best_match(input, pos + 1, &htab, &chain);
if len2 > len + 1 {
if !in_literals { lit_start = pos; in_literals = true; }
pos += 1;
continue;
}
}
if in_literals { flush_literals(&mut out, input, lit_start, pos); in_literals = false; }
// Emit the match token.
out.push(0xFF);
out.extend_from_slice(&(dist as u16).to_le_bytes());
out.extend_from_slice(&(len as u16).to_le_bytes());
// Insert positions covered by the match into the hash table, sparsely
// for long matches to bound the cost.
let step = if len > 64 { 4 } else if len > 32 { 2 } else { 1 };
let mut k = 1;
while k < len && pos + k < limit {
let hk = lz_hash4(input, pos + k);
chain[pos + k] = htab[hk];
htab[hk] = (pos + k + 1) as u32;
k += step;
}
pos += len;
} else {
if !in_literals { lit_start = pos; in_literals = true; }
pos += 1;
}
}
if in_literals { flush_literals(&mut out, input, lit_start, pos); }
out
}
/// Inverse of `lz77_compress`. Corrupt match tokens (zero distance with
/// len != 1, or a distance exceeding the output produced so far) are
/// skipped silently, matching the reader's best-effort error handling.
fn lz77_decompress(input: &[u8]) -> Vec<u8> {
let mut out = Vec::with_capacity(input.len() * 2);
let mut i = 0;
while i < input.len() {
// 0xFF starts a 5-byte token, but only if all 5 bytes are present;
// a truncated escape falls through to the literal path.
if input[i] == 0xFF && i + 4 < input.len() {
let off = u16::from_le_bytes([input[i+1], input[i+2]]) as usize;
let len = u16::from_le_bytes([input[i+3], input[i+4]]) as usize;
i += 5;
if off == 0 && len == 1 {
// Escaped literal 0xFF byte.
out.push(0xFF);
} else if off == 0 || out.len() < off {
// Corrupt token — skip it.
continue;
} else {
let base = out.len() - off;
if base + len <= out.len() {
// Non-overlapping match: bulk copy within the output buffer.
let start = out.len();
out.resize(start + len, 0);
out.copy_within(base..base+len, start);
} else {
// Overlapping (RLE-style) match: must copy byte by byte.
for j in 0..len {
let b = out[base + j];
out.push(b);
}
}
}
} else {
// Copy a run of literal bytes up to the next escape byte.
let start = i;
i += 1;
while i < input.len() && input[i] != 0xFF { i += 1; }
out.extend_from_slice(&input[start..i]);
}
}
out
}
/// Canonical Huffman compressor. Output framing: either a sparse header
/// [0xFF][count][(sym, len) pairs] or a full header [0xFE][256 lengths],
/// whichever is smaller, followed by the u32 original length and an
/// MSB-first bitstream.
fn huffman_compress(input: &[u8]) -> Vec<u8> {
if input.is_empty() { return Vec::new(); }
let mut freq = [0u32; 256];
for &b in input { freq[b as usize] += 1; }
let active: usize = freq.iter().filter(|&&f| f > 0).count();
if active == 0 { return Vec::new(); }
// Min-heap on frequency: Ord is reversed so BinaryHeap (a max-heap)
// pops the least frequent node first.
#[derive(Eq, PartialEq)]
struct Node { freq: u32, sym: Option<u8>, left: Option<Box<Node>>, right: Option<Box<Node>> }
impl Ord for Node { fn cmp(&self, o: &Self) -> std::cmp::Ordering { o.freq.cmp(&self.freq) } }
impl PartialOrd for Node { fn partial_cmp(&self, o: &Self) -> Option<std::cmp::Ordering> { Some(self.cmp(o)) } }
let mut heap = std::collections::BinaryHeap::new();
for (s, &f) in freq.iter().enumerate() {
if f > 0 { heap.push(Box::new(Node { freq: f, sym: Some(s as u8), left: None, right: None })); }
}
// Single distinct symbol: sparse header with one 1-bit code and an
// all-zero bitstream of ceil(len/8) bytes.
if heap.len() == 1 {
let sym = heap.pop().unwrap().sym.unwrap();
let bitstream_len = input.len().div_ceil(8);
let mut out = Vec::with_capacity(2 + 2 + 4 + bitstream_len);
out.push(0xFF); out.push(1); out.push(sym);
out.push(1); out.extend_from_slice(&(input.len() as u32).to_le_bytes());
out.resize(out.len() + bitstream_len, 0); return out;
}
// Standard Huffman tree construction by pairing the two rarest nodes.
while heap.len() > 1 {
let a = heap.pop().unwrap();
let b = heap.pop().unwrap();
heap.push(Box::new(Node { freq: a.freq + b.freq, sym: None, left: Some(a), right: Some(b) }));
}
let root = heap.pop().unwrap();
// Only code LENGTHS are kept; actual codes are reassigned canonically.
let mut code_lens = [0u8; 256];
fn assign(node: &Node, depth: u8, lens: &mut [u8; 256]) {
if let Some(sym) = node.sym { lens[sym as usize] = depth.max(1); }
else {
if let Some(ref l) = node.left { assign(l, depth + 1, lens); }
if let Some(ref r) = node.right { assign(r, depth + 1, lens); }
}
}
assign(&root, 0, &mut code_lens);
// NOTE(review): lengths are clamped to 15 to fit the decoder's 15-bit
// lookup, but clamping without redistributing lengths can oversubscribe
// the code space for pathological distributions — confirm real inputs
// stay within depth 15.
for l in code_lens.iter_mut() { if *l > 15 { *l = 15; } }
// Canonical assignment: sort by (length, symbol), then count codes up.
let mut syms_by_len: Vec<(u8, u8)> = code_lens.iter().enumerate()
.filter(|(_, &l)| l > 0).map(|(s, &l)| (l, s as u8)).collect();
syms_by_len.sort();
let mut codes = [0u32; 256];
let mut code = 0u32;
let mut prev_len = 0u8;
for &(len, sym) in &syms_by_len {
code <<= len - prev_len;
codes[sym as usize] = code;
code += 1;
prev_len = len;
}
// Pack codes MSB-first through a 64-bit bit buffer.
let mut bitbuf: u64 = 0;
let mut bitpos: u32 = 0;
let mut bitstream = Vec::with_capacity(input.len());
for &b in input {
let len = code_lens[b as usize] as u32;
let c = codes[b as usize] as u64;
bitbuf |= c << (64 - bitpos - len);
bitpos += len;
while bitpos >= 8 { bitstream.push((bitbuf >> 56) as u8); bitbuf <<= 8; bitpos -= 8; }
}
if bitpos > 0 { bitstream.push((bitbuf >> 56) as u8); }
// Emit whichever header form is smaller for this alphabet.
let active_entries: Vec<(u8, u8)> = code_lens.iter().enumerate()
.filter(|(_, &l)| l > 0).map(|(s, &l)| (s as u8, l)).collect();
let sparse_hdr_sz = 1 + 1 + active_entries.len() * 2 + 4; let full_hdr_sz = 256 + 4;
let mut out;
if sparse_hdr_sz < full_hdr_sz {
out = Vec::with_capacity(sparse_hdr_sz + bitstream.len());
out.push(0xFF); out.push(active_entries.len() as u8);
for &(sym, len) in &active_entries {
out.push(sym);
out.push(len);
}
} else {
out = Vec::with_capacity(full_hdr_sz + bitstream.len());
out.push(0xFE); out.extend_from_slice(&code_lens);
}
out.extend_from_slice(&(input.len() as u32).to_le_bytes());
out.extend_from_slice(&bitstream);
out
}
/// Canonical Huffman decoder for `huffman_compress`. Accepts the sparse
/// (0xFF), full (0xFE), and legacy headerless (raw 256-length table)
/// framings, then decodes via a 15-bit peek lookup table.
fn huffman_decompress(input: &[u8]) -> Vec<u8> {
if input.is_empty() { return Vec::new(); }
let mut code_lens = [0u8; 256];
let (orig_len, bitstream_start);
match input[0] {
0xFF => {
// Sparse header: count, then (symbol, code length) pairs.
if input.len() < 6 { return Vec::new(); }
let count = input[1] as usize;
let pairs_end = 2 + count * 2;
if input.len() < pairs_end + 4 { return Vec::new(); }
for i in 0..count {
let sym = input[2 + i * 2] as usize;
let len = input[2 + i * 2 + 1];
if sym < 256 { code_lens[sym] = len; }
}
orig_len = u32::from_le_bytes([
input[pairs_end], input[pairs_end+1], input[pairs_end+2], input[pairs_end+3]
]) as usize;
bitstream_start = pairs_end + 4;
}
0xFE => {
// Full header: all 256 code lengths verbatim.
if input.len() < 261 { return Vec::new(); }
code_lens.copy_from_slice(&input[1..257]);
orig_len = u32::from_le_bytes([input[257], input[258], input[259], input[260]]) as usize;
bitstream_start = 261;
}
_ => {
// Legacy framing: no marker byte, 256 lengths start at offset 0.
if input.len() < 260 { return Vec::new(); }
code_lens.copy_from_slice(&input[..256]);
orig_len = u32::from_le_bytes([input[256], input[257], input[258], input[259]]) as usize;
bitstream_start = 260;
}
}
let bitstream = &input[bitstream_start..];
// Rebuild canonical codes exactly as the encoder assigned them.
let mut syms_by_len: Vec<(u8, u8)> = code_lens.iter().enumerate()
.filter(|(_, &l)| l > 0).map(|(s, &l)| (l, s as u8)).collect();
syms_by_len.sort();
if syms_by_len.is_empty() { return Vec::new(); }
// lookup[w] = (symbol, code length) for every 15-bit window w whose
// prefix is that symbol's code.
let mut lookup = [(0u8, 0u8); 1 << 15];
let mut code = 0u32;
let mut prev_len = 0u8;
for &(len, sym) in &syms_by_len {
code <<= len - prev_len;
// NOTE(review): a header length > 15 would underflow `shift` — this
// relies on the encoder clamping lengths to 15; untrusted headers are
// not validated here.
let shift = 15 - len;
let base = (code as usize) << shift;
let count = 1usize << shift;
for i in 0..count {
if base + i < lookup.len() {
lookup[base + i] = (sym, len);
}
}
code += 1;
prev_len = len;
}
// MSB-first bit reader over a 64-bit buffer; peek 15 bits per symbol.
let mut out = Vec::with_capacity(orig_len);
let mut bitbuf: u64 = 0;
let mut bits_avail = 0u32;
let mut byte_pos = 0usize;
while out.len() < orig_len {
// Refill: keep at least 15 valid bits available when input remains.
while bits_avail <= 48 && byte_pos < bitstream.len() {
bitbuf |= (bitstream[byte_pos] as u64) << (56 - bits_avail);
bits_avail += 8;
byte_pos += 1;
}
if bits_avail == 0 { break; }
let peek = (bitbuf >> 49) as usize & 0x7FFF;
let (sym, len) = lookup[peek];
// len == 0 means the window hit an unassigned code: corrupt input.
if len == 0 { break; }
out.push(sym);
bitbuf <<= len;
bits_avail -= len as u32;
}
out.truncate(orig_len);
out
}
// Range-coder parameters: RC_BOT is the renormalization threshold and
// RC_SCALE (2^14) is the fixed total that symbol frequencies are
// normalized to. RC_TOP is unused in this portion of the file.
#[allow(dead_code)]
const RC_TOP: u32 = 1 << 24;
const RC_BOT: u32 = 1 << 16;
const RC_SCALE: u32 = 1 << 14;
/// Order-0 static range coder. Header: u16 active-symbol count, then
/// (symbol, u16 normalized frequency) pairs, then the u32 original
/// length; payload is the range-coded stream plus 4 flush bytes.
/// Frequencies are normalized to sum exactly to RC_SCALE.
fn range_compress(input: &[u8]) -> Vec<u8> {
if input.is_empty() { return Vec::new(); }
let mut freq = [0u32; 256];
for &b in input { freq[b as usize] += 1; }
let active: usize = freq.iter().filter(|&&f| f > 0).count();
if active == 0 { return Vec::new(); }
// Degenerate single-symbol input: header only, no coded payload.
if active == 1 {
let sym = freq.iter().position(|&f| f > 0).unwrap() as u8;
let mut out = Vec::with_capacity(9);
out.extend_from_slice(&1u16.to_le_bytes()); out.push(sym);
out.extend_from_slice(&(RC_SCALE as u16).to_le_bytes());
out.extend_from_slice(&(input.len() as u32).to_le_bytes());
return out;
}
// Scale raw counts to total RC_SCALE, keeping every active symbol at
// frequency >= 1, then nudge the largest entries until the sum is exact.
let total: u64 = input.len() as u64;
let mut norm = [0u16; 256];
let mut norm_total: u32 = 0;
for i in 0..256 {
if freq[i] > 0 {
norm[i] = ((freq[i] as u64 * RC_SCALE as u64 / total).max(1)) as u16;
norm_total += norm[i] as u32;
}
}
while norm_total > RC_SCALE {
let max_i = (0..256).filter(|&i| norm[i] > 1).max_by_key(|&i| norm[i]).unwrap();
norm[max_i] -= 1; norm_total -= 1;
}
while norm_total < RC_SCALE {
let max_i = (0..256).filter(|&i| norm[i] > 0).max_by_key(|&i| freq[i]).unwrap();
norm[max_i] += 1; norm_total += 1;
}
// Cumulative distribution: [cdf[i], cdf[i+1]) is symbol i's interval.
let mut cdf = [0u32; 257];
for (i, &v) in norm.iter().enumerate() { cdf[i + 1] = cdf[i] + v as u32; }
let mut out = Vec::with_capacity(input.len());
out.extend_from_slice(&(active as u16).to_le_bytes());
for (i, &v) in norm.iter().enumerate() {
if v > 0 {
out.push(i as u8);
out.extend_from_slice(&v.to_le_bytes());
}
}
out.extend_from_slice(&(input.len() as u32).to_le_bytes());
// Carry-less low/range coder; range_decompress mirrors the identical
// low/range updates, so wrapping arithmetic stays consistent.
let mut low: u32 = 0;
let mut range: u32 = u32::MAX;
for &b in input {
let sym = b as usize;
let r = range / RC_SCALE;
low = low.wrapping_add(r * cdf[sym]);
// With >= 2 active symbols every frequency is < RC_SCALE, so the
// first arm is the one normally taken; the else arm is defensive.
range = if sym + 1 < 257 && cdf[sym + 1] - cdf[sym] < RC_SCALE {
r * (cdf[sym + 1] - cdf[sym])
} else {
range - r * cdf[sym]
};
// Renormalize: shift out the settled top byte while range is small.
while range < RC_BOT {
out.push((low >> 24) as u8);
low <<= 8;
range <<= 8;
}
}
// Flush the remaining 32 bits of `low`.
out.push((low >> 24) as u8); low <<= 8;
out.push((low >> 24) as u8); low <<= 8;
out.push((low >> 24) as u8); low <<= 8;
out.push((low >> 24) as u8);
out
}
/// Inverse of `range_compress`: rebuild the normalized model from the
/// header, then decode exactly `orig_len` symbols. Malformed headers
/// return an empty vector.
fn range_decompress(input: &[u8]) -> Vec<u8> {
if input.len() < 2 { return Vec::new(); }
let mut p = 0usize;
let active = u16::from_le_bytes([input[p], input[p + 1]]) as usize; p += 2;
if active == 0 { return Vec::new(); }
// (symbol, u16 frequency) pairs.
let mut norm = [0u16; 256];
for _ in 0..active {
if p >= input.len() { return Vec::new(); }
let sym = input[p] as usize; p += 1;
if p + 1 >= input.len() { return Vec::new(); }
norm[sym] = u16::from_le_bytes([input[p], input[p + 1]]); p += 2;
}
if p + 3 >= input.len() { return Vec::new(); }
let orig_len = u32::from_le_bytes([input[p], input[p+1], input[p+2], input[p+3]]) as usize;
p += 4;
// Single-symbol stream has no payload: just repeat the symbol.
if active == 1 {
let sym = (0..256).find(|&i| norm[i] > 0).unwrap_or(0) as u8;
return vec![sym; orig_len];
}
let mut cdf = [0u32; 257];
for (i, &v) in norm.iter().enumerate() { cdf[i + 1] = cdf[i] + v as u32; }
// Dense scale-position -> symbol table for O(1) symbol lookup.
let mut sym_lookup = vec![0u8; RC_SCALE as usize];
for i in 0..256 {
if norm[i] > 0 {
for j in cdf[i]..cdf[i + 1] {
sym_lookup[j as usize] = i as u8;
}
}
}
let coded = &input[p..];
// Prime the 32-bit code window; bytes past the end read as 0.
let mut code: u32 = 0;
let mut cp = 0usize;
for _ in 0..4 {
code = (code << 8) | coded.get(cp).copied().unwrap_or(0) as u32;
cp += 1;
}
// Mirror of the encoder's low/range state.
let mut low: u32 = 0;
let mut range: u32 = u32::MAX;
let mut out = Vec::with_capacity(orig_len);
for _ in 0..orig_len {
let r = range / RC_SCALE;
// Position of `code` within the current interval, clamped to scale.
let offset = ((code.wrapping_sub(low)) / r).min(RC_SCALE - 1);
let sym = sym_lookup[offset as usize];
let si = sym as usize;
low = low.wrapping_add(r * cdf[si]);
range = if cdf[si + 1] - cdf[si] < RC_SCALE {
r * (cdf[si + 1] - cdf[si])
} else {
range - r * cdf[si]
};
// Renormalize in lock-step with the encoder, pulling in new bytes.
while range < RC_BOT {
low <<= 8;
range <<= 8;
code = (code << 8) | coded.get(cp).copied().unwrap_or(0) as u32;
cp += 1;
}
out.push(sym);
}
out
}
/// Compresses a byte block with whichever method yields the smallest output,
/// and prefixes the result with a 1-byte method tag read by `decompress_block`:
/// 0x00 = LZ77, 0x01 = Huffman(LZ77), 0x02 = raw/stored, 0x03 = Huffman(raw),
/// 0x04 = range-coded raw, 0x05 = range-coded LZ77.
///
/// NOTE: the order of the `min_sz` equality cascade below is the tie-break
/// policy (range coding preferred, then raw, then Huffman, then LZ77) and
/// must not be reordered.
fn compress_block(data: &[u8]) -> Vec<u8> {
if data.is_empty() { return Vec::new(); }
// All *_sz values include the 1-byte method tag so comparisons are fair.
let raw_sz = 1 + data.len();
let lz = lz77_compress(data);
let lz_sz = 1 + lz.len();
// Small blocks: entropy-coding overhead rarely pays off, so only raw vs LZ.
if lz.len() < 512 {
if raw_sz <= lz_sz {
let mut out = Vec::with_capacity(raw_sz);
out.push(0x02);
out.extend_from_slice(data);
return out;
}
let mut out = Vec::with_capacity(lz_sz);
out.push(0x00);
out.extend_from_slice(&lz);
return out;
}
let huff_lz = huffman_compress(&lz);
let huff_lz_sz = 1 + huff_lz.len();
// Huffman over the raw bytes is only tried when LZ barely helped
// (LZ output > 85% of input) and the block is big enough to matter.
let huff_raw = if lz.len() * 100 > data.len() * 85 && data.len() >= 512 {
huffman_compress(data)
} else { Vec::new() };
let huff_raw_sz = if huff_raw.is_empty() { usize::MAX } else { 1 + huff_raw.len() };
// Range coding is comparatively slow, so it is skipped entirely when
// Huffman(LZ) already achieved a strong (<70%) ratio.
let (rc_lz, rc_lz_sz, rc_raw, rc_raw_sz) = if huff_lz_sz * 100 < raw_sz * 70 {
(Vec::new(), usize::MAX, Vec::new(), usize::MAX)
} else {
let rl = range_compress(&lz);
let rl_sz = if rl.is_empty() { usize::MAX } else { 1 + rl.len() };
let rr = if data.len() >= 256 { range_compress(data) } else { Vec::new() };
let rr_sz = if rr.is_empty() { usize::MAX } else { 1 + rr.len() };
(rl, rl_sz, rr, rr_sz)
};
let min_sz = raw_sz.min(lz_sz).min(huff_lz_sz).min(huff_raw_sz).min(rc_lz_sz).min(rc_raw_sz);
if min_sz == rc_raw_sz {
let mut out = Vec::with_capacity(rc_raw_sz);
out.push(0x04);
out.extend_from_slice(&rc_raw);
out
} else if min_sz == rc_lz_sz {
let mut out = Vec::with_capacity(rc_lz_sz);
out.push(0x05);
out.extend_from_slice(&rc_lz);
out
} else if min_sz == raw_sz {
let mut out = Vec::with_capacity(raw_sz);
out.push(0x02);
out.extend_from_slice(data);
out
} else if min_sz == huff_lz_sz {
let mut out = Vec::with_capacity(huff_lz_sz);
out.push(0x01);
out.extend_from_slice(&huff_lz);
out
} else if min_sz == huff_raw_sz {
let mut out = Vec::with_capacity(huff_raw_sz);
out.push(0x03);
out.extend_from_slice(&huff_raw);
out
} else {
// Fallback: plain LZ77 (tag 0x00).
let mut out = Vec::with_capacity(lz_sz);
out.push(0x00);
out.extend_from_slice(&lz);
out
}
}
/// Inverse of `compress_block`: dispatches on the leading 1-byte method tag.
/// An unrecognized tag is treated as plain LZ77 (tag 0x00); empty input
/// decodes to an empty vector.
fn decompress_block(data: &[u8]) -> Vec<u8> {
    let Some((&tag, payload)) = data.split_first() else { return Vec::new(); };
    match tag {
        0x01 => lz77_decompress(&huffman_decompress(payload)),
        0x02 => payload.to_vec(),
        0x03 => huffman_decompress(payload),
        0x04 => range_decompress(payload),
        0x05 => lz77_decompress(&range_decompress(payload)),
        _ => lz77_decompress(payload),
    }
}
/// Derives a deterministic 12-byte nonce from a column name and chunk index:
/// FNV-1a over the name bytes, one more FNV round mixing in the chunk index,
/// then a multiplicative finalizer for the trailing 4 bytes.
fn derive_nonce(col_name: &str, chunk_idx: usize) -> [u8; 12] {
    const FNV_OFFSET: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;
    let mut h = col_name
        .bytes()
        .fold(FNV_OFFSET, |acc, b| (acc ^ b as u64).wrapping_mul(FNV_PRIME));
    h = (h ^ chunk_idx as u64).wrapping_mul(FNV_PRIME);
    let h2 = h.wrapping_mul(0x517cc1b727220a95);
    let mut nonce = [0u8; 12];
    nonce[..8].copy_from_slice(&h.to_le_bytes());
    nonce[8..].copy_from_slice(&h2.to_le_bytes()[..4]);
    nonce
}
#[allow(dead_code)]
/// XOR stream cipher used for lightweight obfuscation — NOT cryptographically
/// secure. An all-zero key means "encryption disabled" and returns the input
/// unchanged; applying the function twice with the same key restores the
/// original bytes.
fn xor_crypt(data: &[u8], key: &[u8; 32]) -> Vec<u8> {
    if key.iter().all(|&b| b == 0) { return data.to_vec(); }
    // Keystream state seeded from the first 8 key bytes, then stirred per
    // byte with a key byte, a golden-ratio multiply, and a rotation.
    let mut state = u64::from_le_bytes(key[..8].try_into().unwrap_or([0u8; 8]));
    data.iter()
        .enumerate()
        .map(|(i, &b)| {
            state ^= key[i % 32] as u64;
            state = state.wrapping_mul(0x9e3779b97f4a7c15).rotate_left(17);
            b ^ (state >> 32) as u8
        })
        .collect()
}
/// Fixed-size Bloom filter over strings (three hash probes into 4096 bits),
/// used for cheap membership pruning. May report false positives, never
/// false negatives.
#[derive(Clone)]
pub struct Bloom {
// 64 little-endian u64 words = 4096 bits of filter state.
bits: [u64; 64],
}
impl Bloom {
    /// The three probe seeds; each yields an independent bit position.
    const SEEDS: [u64; 3] = [0x9e3779b97f4a7c15, 0x6c62272e07bb0142, 0xbf58476d1ce4e5b9];
    /// Creates an empty filter (no bits set).
    pub fn new() -> Self { Bloom { bits: [0u64; 64] } }
    /// FNV-style multiply-xor hash of `s`, reduced to a bit index in [0, 4096).
    fn hash(seed: u64, s: &str) -> usize {
        let mixed = s
            .bytes()
            .fold(seed, |h, b| (h ^ b as u64).wrapping_mul(0x517cc1b727220a95));
        mixed as usize % 4096
    }
    /// Sets the three bit positions derived from `s`.
    pub fn insert(&mut self, s: &str) {
        for seed in Self::SEEDS {
            let pos = Self::hash(seed, s);
            self.bits[pos >> 6] |= 1u64 << (pos & 63);
        }
    }
    /// Returns true when `s` may have been inserted; a false result is definitive.
    pub fn may_contain(&self, s: &str) -> bool {
        Self::SEEDS.iter().all(|&seed| {
            let pos = Self::hash(seed, s);
            self.bits[pos >> 6] & (1u64 << (pos & 63)) != 0
        })
    }
    /// Serializes the filter as 512 bytes: 64 little-endian u64 words.
    fn to_bytes(&self) -> Vec<u8> {
        self.bits.iter().flat_map(|w| w.to_le_bytes()).collect()
    }
    #[allow(dead_code)]
    /// Rebuilds a filter from `to_bytes` output. Short input leaves the
    /// remaining words zero; extra bytes and a trailing partial word are ignored.
    fn from_bytes(data: &[u8]) -> Self {
        let mut bf = Bloom::new();
        for (slot, chunk) in bf.bits.iter_mut().zip(data.chunks_exact(8)) {
            *slot = u64::from_le_bytes(chunk.try_into().unwrap_or([0; 8]));
        }
        bf
    }
}
/// `Default` is the empty filter, identical to `Bloom::new()`.
impl Default for Bloom {
fn default() -> Self { Bloom::new() }
}
/// Run-length encodes `nums` as a varint run count followed by
/// (varint run length, zigzag-varint value) pairs. An empty slice encodes
/// as a single zero count.
fn encode_rle_int(nums: &[i64]) -> Vec<u8> {
    let mut buf = Vec::new();
    let Some((&first, rest)) = nums.split_first() else {
        write_varint(&mut buf, 0);
        return buf;
    };
    let mut runs: Vec<(u32, i64)> = vec![(1, first)];
    for &n in rest {
        match runs.last_mut() {
            Some(run) if run.1 == n => run.0 += 1,
            _ => runs.push((1, n)),
        }
    }
    write_varint(&mut buf, runs.len() as u64);
    for (count, value) in runs {
        write_varint(&mut buf, count as u64);
        write_zvar(&mut buf, value);
    }
    buf
}
/// Inverse of `encode_rle_int`: reads the varint run count, then each
/// (run length, zigzag value) pair, expanding runs into `out`. Returns the
/// decoded values and the position one past the last byte consumed.
///
/// Stops early when the buffer is exhausted, mirroring the guard already
/// present in `decode_rle_str` — previously a corrupt run count would spin
/// through phantom runs without any bounds check.
fn decode_rle_int(data: &[u8], pos: usize, nrows: usize) -> (Vec<i64>, usize) {
    let (nruns, mut p) = read_varint(data, pos);
    let mut out = Vec::with_capacity(nrows);
    for _ in 0..nruns {
        // Truncated/corrupt input: there is nothing left to read.
        if p >= data.len() { break; }
        let (cnt, p2) = read_varint(data, p);
        let (val, p3) = read_zvar(data, p2);
        p = p3;
        for _ in 0..cnt { out.push(val); }
    }
    (out, p)
}
/// Delta-encodes `nums`: the first value as a zigzag varint, then each
/// successive difference. Empty input produces an empty buffer.
fn encode_delta_int(nums: &[i64]) -> Vec<u8> {
    let mut buf = Vec::new();
    if let Some(&first) = nums.first() {
        write_zvar(&mut buf, first);
        for pair in nums.windows(2) {
            write_zvar(&mut buf, pair[1] - pair[0]);
        }
    }
    buf
}
/// Inverse of `encode_delta_int`: reads the zigzag base value, then
/// `nrows - 1` zigzag deltas, and prefix-sums them back to absolute values.
/// Returns the values and the position one past the last byte consumed.
///
/// NOTE(review): `acc += d` uses plain addition, so wildly corrupt deltas
/// could overflow-panic in debug builds — presumably acceptable because the
/// encoder only writes in-range differences; confirm against callers.
fn decode_delta_int(data: &[u8], pos: usize, nrows: usize) -> (Vec<i64>, usize) {
if nrows == 0 { return (Vec::new(), pos); }
let (base, mut p) = read_zvar(data, pos);
let mut deltas = Vec::with_capacity(nrows - 1);
for _ in 1..nrows {
let (d, p2) = read_zvar(data, p);
deltas.push(d);
p = p2;
}
let mut out = Vec::with_capacity(nrows);
out.push(base);
let n = deltas.len();
let mut acc = base;
// Manual 4-way unroll of the prefix sum; behavior matches the plain loop
// below, which handles the remaining 0..3 tail elements.
let chunks = n / 4;
for c in 0..chunks {
let i = c * 4;
acc += deltas[i]; out.push(acc);
acc += deltas[i+1]; out.push(acc);
acc += deltas[i+2]; out.push(acc);
acc += deltas[i+3]; out.push(acc);
}
for &d in deltas.iter().skip(chunks * 4) {
acc += d;
out.push(acc);
}
(out, p)
}
/// Maps every string through the file-global dictionary (strings missing
/// from the dictionary map to index 0) and RLE-encodes the index stream.
fn encode_dict_rle(vals: &[&str], global_dict: &HashMap<String, u32>) -> Vec<u8> {
    let indices: Vec<i64> = vals
        .iter()
        .map(|&v| global_dict.get(v).copied().unwrap_or(0) as i64)
        .collect();
    encode_rle_int(&indices)
}
/// Inverse of `encode_dict_rle`: decodes the RLE index stream and resolves
/// each index against `dict`; out-of-range indices become empty strings.
fn decode_dict_rle(data: &[u8], pos: usize, nrows: usize, dict: &[String]) -> (Vec<String>, usize) {
    let (indices, end) = decode_rle_int(data, pos, nrows);
    let resolved = indices
        .into_iter()
        .map(|i| dict.get(i as usize).cloned().unwrap_or_default())
        .collect();
    (resolved, end)
}
/// Packs booleans 8-per-byte, LSB-first: bit i of each output byte holds
/// element i of that 8-element group. The final byte is zero-padded.
fn encode_bitpack(bits: &[bool]) -> Vec<u8> {
    bits.chunks(8)
        .map(|group| {
            group
                .iter()
                .enumerate()
                .fold(0u8, |byte, (i, &on)| if on { byte | (1 << i) } else { byte })
        })
        .collect()
}
/// Unpacks `nrows` booleans starting at byte offset `pos` (LSB-first, the
/// inverse of `encode_bitpack`). Bytes past the end of `data` read as zero
/// (all-false). Returns the values and the position past the consumed bytes.
fn decode_bitpack(data: &[u8], pos: usize, nrows: usize) -> (Vec<bool>, usize) {
    let out: Vec<bool> = (0..nrows)
        .map(|i| {
            let byte = data.get(pos + i / 8).copied().unwrap_or(0);
            byte & (1 << (i % 8)) != 0
        })
        .collect();
    (out, pos + nrows.div_ceil(8))
}
/// Bit-packed dictionary codec for strings: builds a dictionary ordered by
/// descending frequency (ties broken lexicographically for determinism),
/// writes the entries (varint count, then varint-length-prefixed bytes),
/// a single bits-per-index byte, then the LSB-first packed index stream.
///
/// NOTE(review): the u64 bit buffer below only holds bits correctly while
/// bits_per + 7 <= 64; callers cap cardinality at 65536 (bits_per <= 17),
/// so the limit is not reached in practice — confirm if reused elsewhere.
fn encode_bdict(vals: &[&str]) -> Vec<u8> {
let mut freq_map: HashMap<&str, u32> = HashMap::new();
for &v in vals { *freq_map.entry(v).or_insert(0) += 1; }
let mut entries: Vec<(&str, u32)> = freq_map.into_iter().collect();
// Frequency-descending so the hottest values get the lowest indices.
entries.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(b.0)));
let mut dict_map: HashMap<&str, u32> = HashMap::with_capacity(entries.len());
let mut dict_list: Vec<&str> = Vec::with_capacity(entries.len());
for (s, _) in &entries {
dict_map.insert(s, dict_list.len() as u32);
dict_list.push(s);
}
let n_unique = dict_list.len();
// Minimum bits to address every dictionary slot (at least 1).
let bits_per = if n_unique <= 1 { 1 } else { (64 - (n_unique as u64 - 1).leading_zeros()) as usize };
let mut buf = Vec::new();
write_varint(&mut buf, n_unique as u64);
for &s in &dict_list {
let b = s.as_bytes();
write_varint(&mut buf, b.len() as u64);
buf.extend_from_slice(b);
}
buf.push(bits_per as u8);
// LSB-first bit packing: flush whole bytes as they become available.
let mut bitbuf: u64 = 0;
let mut bitpos: u32 = 0;
for &v in vals {
let idx = dict_map[v] as u64;
bitbuf |= idx << bitpos;
bitpos += bits_per as u32;
while bitpos >= 8 {
buf.push((bitbuf & 0xFF) as u8);
bitbuf >>= 8;
bitpos -= 8;
}
}
// Flush any remaining partial byte (zero-padded high bits).
if bitpos > 0 { buf.push((bitbuf & 0xFF) as u8); }
buf
}
/// Inverse of `encode_bdict`: reads the dictionary, the bits-per-index byte,
/// then unpacks `nrows` LSB-first indices and resolves them (out-of-range
/// indices yield empty strings).
///
/// Hardened against corrupt input: the stored bit width is clamped to 63
/// (the old `1u64 << bits_per` panicked for widths >= 64), slice bounds are
/// clamped on both ends, and a truncated payload no longer underflows the
/// `bits_avail` counter.
fn decode_bdict(data: &[u8], pos: usize, nrows: usize) -> (Vec<String>, usize) {
    let mut p = pos;
    let (n_unique, np) = read_varint(data, p);
    p = np;
    // Cap the capacity hint so a corrupt count cannot force a huge allocation;
    // each real entry occupies at least one byte of `data`.
    let mut dict: Vec<String> = Vec::with_capacity((n_unique as usize).min(data.len()));
    for _ in 0..n_unique {
        if p >= data.len() { break; } // truncated dictionary
        let (slen, after_len) = read_varint(data, p);
        let start = after_len.min(data.len());
        let end = (after_len + slen as usize).min(data.len());
        dict.push(String::from_utf8_lossy(&data[start..end]).into_owned());
        p = after_len + slen as usize;
    }
    // Clamp: shifts by >= 64 panic. Valid encoders write at most 17 bits
    // (cardinality is capped at 65536), so this only affects corrupt files.
    let bits_per = (data.get(p).copied().unwrap_or(1) as usize).min(63);
    p += 1;
    let mask = (1u64 << bits_per) - 1;
    let mut out = Vec::with_capacity(nrows);
    let mut bitbuf: u64 = 0;
    let mut bits_avail: u32 = 0;
    for _ in 0..nrows {
        while bits_avail < bits_per as u32 && p < data.len() {
            bitbuf |= (data[p] as u64) << bits_avail;
            bits_avail += 8;
            p += 1;
        }
        let idx = (bitbuf & mask) as usize;
        bitbuf >>= bits_per;
        // saturating: on truncated input bits_avail may be below bits_per,
        // and a plain subtraction would underflow and panic.
        bits_avail = bits_avail.saturating_sub(bits_per as u32);
        out.push(dict.get(idx).cloned().unwrap_or_default());
    }
    (out, p)
}
/// Dictionary + Huffman codec for low-cardinality string columns: writes the
/// dictionary in frequency-descending order (ties broken lexicographically),
/// then the Huffman-compressed one-byte index stream. Falls back to
/// `encode_bdict` when there are more than 256 distinct values, since each
/// index must fit in a single byte.
fn encode_huffdict(vals: &[&str]) -> Vec<u8> {
    let mut freq: HashMap<&str, u32> = HashMap::new();
    for &v in vals { *freq.entry(v).or_insert(0) += 1; }
    let mut entries: Vec<(&str, u32)> = freq.into_iter().collect();
    entries.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(b.0)));
    if entries.len() > 256 {
        return encode_bdict(vals);
    }
    let mut index_of: HashMap<&str, u32> = HashMap::with_capacity(entries.len());
    let mut buf = Vec::new();
    write_varint(&mut buf, entries.len() as u64);
    for (rank, (s, _)) in entries.iter().enumerate() {
        index_of.insert(s, rank as u32);
        write_varint(&mut buf, s.len() as u64);
        buf.extend_from_slice(s.as_bytes());
    }
    let indices: Vec<u8> = vals.iter().map(|&v| index_of[v] as u8).collect();
    buf.extend_from_slice(&huffman_compress(&indices));
    buf
}
/// Inverse of `encode_huffdict`: reads the dictionary, Huffman-decodes the
/// index stream (which occupies the remainder of `data`), and resolves each
/// index; out-of-range indices yield empty strings. Because the Huffman
/// payload runs to the end of the buffer, the returned position is always
/// `data.len()`.
///
/// Hardened against corrupt input: slice bounds are clamped on both ends and
/// the dictionary loop stops when the buffer is exhausted — previously a
/// corrupt length could push an offset past `data` and panic on the slice.
fn decode_huffdict(data: &[u8], pos: usize, nrows: usize) -> (Vec<String>, usize) {
    let mut p = pos;
    let (n_unique, np) = read_varint(data, p);
    p = np;
    // Cap the capacity hint so a corrupt count cannot force a huge allocation.
    let mut dict: Vec<String> = Vec::with_capacity((n_unique as usize).min(data.len()));
    for _ in 0..n_unique {
        if p >= data.len() { break; } // truncated dictionary
        let (slen, after_len) = read_varint(data, p);
        let start = after_len.min(data.len());
        let end = (after_len + slen as usize).min(data.len());
        dict.push(String::from_utf8_lossy(&data[start..end]).into_owned());
        p = after_len + slen as usize;
    }
    let payload = &data[p.min(data.len())..];
    let out: Vec<String> = huffman_decompress(payload)
        .iter()
        .take(nrows)
        .map(|&idx| dict.get(idx as usize).cloned().unwrap_or_default())
        .collect();
    (out, data.len())
}
/// Constant-delta codec: stores only the first value and the step between
/// the first two values (the caller guarantees a constant stride via
/// `is_cdelta`). Empty input stores (0, 0); a single value stores step 0.
fn encode_cdelta(nums: &[i64]) -> Vec<u8> {
    let (base, step) = match nums {
        [] => (0, 0),
        [only] => (*only, 0),
        [first, second, ..] => (*first, *second - *first),
    };
    let mut buf = Vec::new();
    write_zvar(&mut buf, base);
    write_zvar(&mut buf, step);
    buf
}
/// Inverse of `encode_cdelta`: reconstructs `nrows` values of the arithmetic
/// sequence `base, base+step, base+2*step, ...`. Returns the values and the
/// position past the two zigzag varints.
///
/// Uses wrapping arithmetic so a corrupt base/step pair cannot trigger an
/// overflow panic in debug builds (the old `base + step * i` could).
/// Results are identical for any sequence a valid encoder produced.
fn decode_cdelta(data: &[u8], pos: usize, nrows: usize) -> (Vec<i64>, usize) {
    let (base, p1) = read_zvar(data, pos);
    let (step, p2) = read_zvar(data, p1);
    let out: Vec<i64> = (0..nrows as i64)
        .map(|i| base.wrapping_add(step.wrapping_mul(i)))
        .collect();
    (out, p2)
}
/// Returns true when `nums` is an arithmetic sequence (constant stride),
/// making it eligible for the two-value `CDelta` codec. Slices of length
/// <= 2 are trivially constant-stride.
///
/// Differences use `wrapping_sub` so extreme values (e.g. spanning more than
/// i64::MAX) compare consistently instead of overflow-panicking in debug
/// builds; normal inputs behave exactly as before.
fn is_cdelta(nums: &[i64]) -> bool {
    if nums.len() <= 2 { return true; }
    let step = nums[1].wrapping_sub(nums[0]);
    nums.windows(2).all(|w| w[1].wrapping_sub(w[0]) == step)
}
/// Frame-of-reference encoder: writes the zigzag minimum, one byte giving
/// the bits needed per residual (value - min), then the LSB-first packed
/// residuals. A zero-range column stores only the header.
///
/// Fixes over the previous version (output is byte-identical for every
/// width the codec selector actually uses, bits_per <= 32):
/// - the value span is computed with `wrapping_sub`, so min/max pairs whose
///   difference exceeds i64::MAX no longer overflow-panic in debug builds
///   (the wrapped bit pattern as u64 is the correct unsigned span);
/// - the bit accumulator is u128, so `residual << bitpos` cannot drop high
///   bits even at bits_per = 64 (the old u64 buffer lost bits above 57).
/// NOTE(review): `decode_for`'s 8-byte fast path only reconstructs widths
/// up to 57 bits exactly; the selector's 32-bit cap keeps both sides safe.
fn encode_for(nums: &[i64]) -> Vec<u8> {
    if nums.is_empty() { return Vec::new(); }
    let min_val = *nums.iter().min().unwrap();
    let max_val = *nums.iter().max().unwrap();
    let range = max_val.wrapping_sub(min_val) as u64;
    let bits_per = if range == 0 { 0 } else { (64 - range.leading_zeros()) as usize };
    let mut buf = Vec::new();
    write_zvar(&mut buf, min_val);
    buf.push(bits_per as u8);
    if bits_per == 0 { return buf; }
    let mut bitbuf: u128 = 0;
    let mut bitpos: u32 = 0;
    for &n in nums {
        // Residual fits in bits_per bits by construction (min <= n <= max).
        let residual = n.wrapping_sub(min_val) as u64 as u128;
        bitbuf |= residual << bitpos;
        bitpos += bits_per as u32;
        while bitpos >= 8 {
            buf.push((bitbuf & 0xFF) as u8);
            bitbuf >>= 8;
            bitpos -= 8;
        }
    }
    // Flush the final partial byte, zero-padded in the high bits.
    if bitpos > 0 { buf.push((bitbuf & 0xFF) as u8); }
    buf
}
/// Inverse of `encode_for`: reads the zigzag minimum and the per-value bit
/// width, then unpacks `nrows` LSB-first residuals. A fast path reads
/// unaligned 8-byte words while a full load fits inside the payload; the
/// tail path uses a zero-padded scratch buffer, so truncated data decodes
/// to values instead of panicking.
///
/// The stored width is clamped to 64 and the mask is built without shifting
/// by the full word size — previously a corrupt width byte (e.g. 200) made
/// `1u64 << bits_per` panic. Reconstruction uses `wrapping_add` to match the
/// encoder's wrapping residuals.
fn decode_for(data: &[u8], pos: usize, nrows: usize) -> (Vec<i64>, usize) {
    let (min_val, mut p) = read_zvar(data, pos);
    let bits_per = (data.get(p).copied().unwrap_or(0) as usize).min(64);
    p += 1;
    if bits_per == 0 {
        // Zero range: every value equals the minimum.
        return (vec![min_val; nrows], p);
    }
    let mask = if bits_per == 64 { u64::MAX } else { (1u64 << bits_per) - 1 };
    let total_bytes = (bits_per * nrows).div_ceil(8);
    let avail = data.len().saturating_sub(p);
    let packed = &data[p..p + total_bytes.min(avail)];
    let mut out = Vec::with_capacity(nrows);
    let mut bitpos: usize = 0;
    // Fast path bound: the last bit position where an 8-byte read stays
    // inside `packed` (bits_per is >= 1 here, so the division is safe).
    let safe_bitpos = if packed.len() >= 8 { (packed.len() - 7) * 8 } else { 0 };
    let bulk_count = if safe_bitpos > 0 { (safe_bitpos - 1) / bits_per } else { 0 };
    let bulk_n = bulk_count.min(nrows);
    for _ in 0..bulk_n {
        let byte_start = bitpos >> 3;
        let bit_offset = (bitpos & 7) as u32;
        let word = u64::from_le_bytes([
            packed[byte_start], packed[byte_start+1], packed[byte_start+2], packed[byte_start+3],
            packed[byte_start+4], packed[byte_start+5], packed[byte_start+6], packed[byte_start+7],
        ]);
        out.push(min_val.wrapping_add(((word >> bit_offset) & mask) as i64));
        bitpos += bits_per;
    }
    // Tail path: copy whatever bytes remain into a zero-padded word.
    for _ in bulk_n..nrows {
        let byte_start = bitpos >> 3;
        let bit_offset = (bitpos & 7) as u32;
        let mut tmp = [0u8; 8];
        let tail = packed.len().saturating_sub(byte_start).min(8);
        tmp[..tail].copy_from_slice(&packed[byte_start..byte_start + tail]);
        let word = u64::from_le_bytes(tmp);
        out.push(min_val.wrapping_add(((word >> bit_offset) & mask) as i64));
        bitpos += bits_per;
    }
    (out, p + bitpos.div_ceil(8))
}
/// Run-length encodes strings: a varint run count, then per run a varint
/// repeat count, a varint byte length, and the raw string bytes. An empty
/// slice encodes as a single zero count.
fn encode_rle_str(vals: &[&str]) -> Vec<u8> {
    let mut buf = Vec::new();
    let Some((&first, rest)) = vals.split_first() else {
        write_varint(&mut buf, 0);
        return buf;
    };
    let mut runs: Vec<(u32, &str)> = vec![(1, first)];
    for &v in rest {
        match runs.last_mut() {
            Some(run) if run.1 == v => run.0 += 1,
            _ => runs.push((1, v)),
        }
    }
    write_varint(&mut buf, runs.len() as u64);
    for (count, s) in &runs {
        write_varint(&mut buf, *count as u64);
        write_varint(&mut buf, s.len() as u64);
        buf.extend_from_slice(s.as_bytes());
    }
    buf
}
/// Inverse of `encode_rle_str`. Stops early on truncated input; string bytes
/// running past the buffer are clipped, and a run whose recorded offset is
/// itself out of range expands to empty strings.
fn decode_rle_str(data: &[u8], pos: usize, nrows: usize) -> (Vec<String>, usize) {
    let (nruns, mut p) = read_varint(data, pos);
    let mut out = Vec::with_capacity(nrows);
    for _ in 0..nruns {
        if p >= data.len() { break; }
        let (count, after_count) = read_varint(data, p);
        let (slen, start) = read_varint(data, after_count);
        let end = start + slen as usize;
        let clipped = end.min(data.len());
        let s = if start <= clipped {
            String::from_utf8_lossy(&data[start..clipped]).into_owned()
        } else {
            String::new()
        };
        out.extend(std::iter::repeat(s).take(count as usize));
        p = end;
    }
    (out, p)
}
/// Stores strings back-to-back, each prefixed with its varint byte length.
fn encode_raw_str(vals: &[&str]) -> Vec<u8> {
    vals.iter().fold(Vec::new(), |mut buf, s| {
        write_varint(&mut buf, s.len() as u64);
        buf.extend_from_slice(s.as_bytes());
        buf
    })
}
/// Inverse of `encode_raw_str`: reads `nrows` varint-length-prefixed strings.
/// Returns the strings and the position past the last recorded string,
/// which may exceed `data.len()` on truncated input (mirroring the previous
/// behavior so callers see the same consumed offset).
///
/// Both slice bounds are now clamped: a corrupt length could previously push
/// the *start* offset past the buffer and panic (only the end was clamped,
/// unlike the guard in `decode_rle_str`).
fn decode_raw_str(data: &[u8], pos: usize, nrows: usize) -> (Vec<String>, usize) {
    let mut out = Vec::with_capacity(nrows);
    let mut p = pos;
    for _ in 0..nrows {
        let (slen, np) = read_varint(data, p);
        let start = np.min(data.len());
        let end = (np + slen as usize).min(data.len());
        out.push(String::from_utf8_lossy(&data[start..end]).into_owned());
        p = np + slen as usize;
    }
    (out, p)
}
/// Picks the cheapest integer codec for `nums` by trial-encoding the
/// candidates: constant-stride columns take CDelta outright; otherwise the
/// smallest of FOR (only tried for spans that fit 32 bits), Delta, and RLE
/// wins, with ties preferring FOR, then Delta.
///
/// Changes from the previous version (selection results are identical):
/// - the span is computed with `wrapping_sub`, so extreme min/max pairs no
///   longer overflow-panic in debug builds;
/// - the sort/dedup uniqueness check was dead code and is removed: a column
///   with <= 1 distinct value is a zero-step arithmetic sequence, so
///   `is_cdelta` already returned CDelta before that branch could run.
fn select_int_codec(nums: &[i64]) -> Codec {
    if nums.is_empty() { return Codec::Raw; }
    if is_cdelta(nums) { return Codec::CDelta; }
    let min = *nums.iter().min().unwrap();
    let max = *nums.iter().max().unwrap();
    // Wrapped difference reinterpreted as u64 is the correct unsigned span.
    let range = max.wrapping_sub(min) as u64;
    let bits_for = if range == 0 { 0 } else { (64 - range.leading_zeros()) as usize };
    let rle_sz = encode_rle_int(nums).len();
    let delta_sz = encode_delta_int(nums).len();
    let for_sz = if bits_for <= 32 { encode_for(nums).len() } else { usize::MAX };
    if for_sz <= rle_sz && for_sz <= delta_sz { return Codec::FOR; }
    if delta_sz <= rle_sz { Codec::Delta } else { Codec::RLE }
}
/// Picks a string codec from quick cardinality and run estimates: Raw above
/// 65536 distinct values, RLE for single-valued columns, and otherwise RLE
/// vs BDict by comparing their estimated encoded sizes.
///
/// Selection results are identical to the previous version; the trailing
/// `Codec::Raw` branch was unreachable (the scan loop already returns Raw
/// the moment the distinct count exceeds 65536, so `uniq <= 65536` always
/// held afterwards) and has been removed.
fn select_str_codec(vals: &[&str]) -> Codec {
    if vals.is_empty() { return Codec::Raw; }
    let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::with_capacity(256);
    for &v in vals {
        seen.insert(v);
        // Too many distinct values for any dictionary codec: bail out early.
        if seen.len() > 65536 { return Codec::Raw; }
    }
    let uniq = seen.len();
    if uniq <= 1 { return Codec::RLE; }
    // BDict cost: dictionary bytes (+5 per entry for length varints/slack)
    // plus the packed index stream.
    let bits_per = 64 - (uniq as u64 - 1).leading_zeros();
    let dict_overhead: usize = seen.iter().map(|k| k.len() + 5).sum();
    let bdict_est = dict_overhead + (bits_per as usize * vals.len()).div_ceil(8);
    // RLE cost: one (count, length, bytes) record per run of equal values.
    let runs = 1 + vals.windows(2).filter(|w| w[0] != w[1]).count();
    let avg_str_len = seen.iter().map(|s| s.len()).sum::<usize>() / uniq.max(1);
    let rle_est = runs * (avg_str_len + 5);
    if rle_est <= bdict_est { Codec::RLE } else { Codec::BDict }
}
#[allow(dead_code)]
/// Single-pass string-column encoder: in one scan over `vals` it gathers
/// null count, lexicographic min/max, run count, and a local dictionary,
/// then picks a codec (Raw for high cardinality, RLE for single-valued or
/// run-friendly data, BDict otherwise), encodes the column, and builds a
/// Bloom filter over the distinct values (empty for the Raw path).
fn select_encode_str_col(vals: &[KVal]) -> (Codec, Vec<u8>, ColStats, Bloom) {
let n = vals.len();
if n == 0 {
return (Codec::Raw, Vec::new(), ColStats { null_count: 0, min_i64: 0, max_i64: 0, min_str: String::new(), max_str: String::new() }, Bloom::new());
}
let mut dict_map: HashMap<&str, u32> = HashMap::with_capacity(256);
let mut dict_list: Vec<&str> = Vec::with_capacity(256);
let mut runs = 1usize;
let mut null_count = 0u32;
let mut min_str: Option<&str> = None;
let mut max_str: Option<&str> = None;
// Set once the dictionary would exceed 65536 entries; stops dict growth.
let mut high_card = false;
// First element is handled separately to seed run/min/max tracking.
// NOTE(review): nulls stringify as "" via `as_str`, so a leading null still
// enters the dictionary as the empty string — confirm that is intended.
let first_s = vals[0].as_str();
if vals[0].is_null() { null_count += 1; }
else {
min_str = Some(first_s);
max_str = Some(first_s);
}
if !dict_map.contains_key(first_s) { dict_map.insert(first_s, 0); dict_list.push(first_s); }
let mut prev_s = first_s;
for v in vals.iter().take(n).skip(1) {
let s = v.as_str();
if v.is_null() { null_count += 1; }
else {
// Nulls are excluded from min/max tracking.
match min_str {
None => { min_str = Some(s); max_str = Some(s); }
Some(mn) => {
if s < mn { min_str = Some(s); }
if s > max_str.unwrap_or("") { max_str = Some(s); }
}
}
}
if s != prev_s { runs += 1; prev_s = s; }
if !high_card && !dict_map.contains_key(s) {
if dict_list.len() >= 65536 { high_card = true; }
else { dict_map.insert(s, dict_list.len() as u32); dict_list.push(s); }
}
}
let stats = ColStats {
null_count, min_i64: 0, max_i64: 0,
min_str: min_str.unwrap_or("").to_string(),
max_str: max_str.unwrap_or("").to_string(),
};
let uniq = dict_list.len();
let codec;
let encoded;
// High cardinality: dictionaries don't pay off, store raw strings and skip
// the Bloom filter (it would be saturated anyway).
if high_card || uniq > 65536 {
codec = Codec::Raw;
let strs: Vec<&str> = vals.iter().map(|v| v.as_str()).collect();
encoded = encode_raw_str(&strs);
return (codec, encoded, stats, Bloom::new());
}
// Single distinct value: RLE collapses the whole column to one run.
if uniq <= 1 {
codec = Codec::RLE;
let strs: Vec<&str> = vals.iter().map(|v| v.as_str()).collect();
encoded = encode_rle_str(&strs);
let mut bloom = Bloom::new();
for &s in &dict_list { bloom.insert(s); }
return (codec, encoded, stats, bloom);
}
// Size estimates mirror `select_str_codec`: dictionary bytes + packed
// indices for BDict vs per-run records for RLE.
let bits_per = 64 - (uniq as u64 - 1).leading_zeros();
let dict_overhead: usize = dict_list.iter().map(|k| k.len() + 5).sum();
let bdict_est = dict_overhead + (bits_per as usize * n).div_ceil(8);
let avg_str_len = dict_list.iter().map(|s| s.len()).sum::<usize>() / uniq.max(1);
let rle_est = runs * (avg_str_len + 5);
if rle_est <= bdict_est {
codec = Codec::RLE;
let strs: Vec<&str> = vals.iter().map(|v| v.as_str()).collect();
encoded = encode_rle_str(&strs);
} else {
// Inline BDict encoding reusing the dictionary built during the scan
// (dictionary order here is first-appearance, not frequency order).
codec = Codec::BDict;
let bits = bits_per as usize;
let mut buf = Vec::with_capacity(dict_overhead + (bits * n).div_ceil(8) + 16);
write_varint(&mut buf, uniq as u64);
for &s in &dict_list {
let b = s.as_bytes();
write_varint(&mut buf, b.len() as u64);
buf.extend_from_slice(b);
}
buf.push(bits as u8);
// LSB-first bit packing, flushing whole bytes as they fill.
let mut bitbuf: u64 = 0;
let mut bitpos: u32 = 0;
for v in vals {
let idx = dict_map[v.as_str()] as u64;
bitbuf |= idx << bitpos;
bitpos += bits as u32;
while bitpos >= 8 {
buf.push((bitbuf & 0xFF) as u8);
bitbuf >>= 8;
bitpos -= 8;
}
}
if bitpos > 0 { buf.push((bitbuf & 0xFF) as u8); }
encoded = buf;
}
let mut bloom = Bloom::new();
for &s in &dict_list { bloom.insert(s); }
(codec, encoded, stats, bloom)
}
/// Encodes one column chunk with the default float scale of 10000
/// (four decimal places preserved). Thin wrapper over
/// `encode_column_data_scaled`.
fn encode_column_data(
values: &[KVal],
col: &KColumn,
codec: Codec,
global_dict: &HashMap<String, u32>,
) -> Vec<u8> {
encode_column_data_scaled(values, col, codec, global_dict, 10000.0)
}
/// Encodes one column chunk according to its declared type and chosen codec.
/// Floats are converted to scaled integers (`value * fscale`, rounded) and
/// prefixed with a 0xFE marker byte plus a scale exponent so the decoder can
/// restore the original magnitude. Bytes and nested types use simple
/// varint-length-prefixed layouts and ignore `codec`.
fn encode_column_data_scaled(
values: &[KVal],
col: &KColumn,
codec: Codec,
global_dict: &HashMap<String, u32>,
fscale: f64,
) -> Vec<u8> {
match col.ktype {
KType::Bool => {
// Coerce loosely: ints are truthy when nonzero; strings accept "1"/"true".
let bits: Vec<bool> = values.iter().map(|v| match v {
KVal::Bool(b) => *b,
KVal::Int(n) => *n != 0,
KVal::Str(s) => s == "1" || s.eq_ignore_ascii_case("true"),
_ => false,
}).collect();
match codec {
Codec::RLE => {
let nums: Vec<i64> = bits.iter().map(|&b| b as i64).collect();
encode_rle_int(&nums)
}
// Any other codec falls back to plain bit-packing.
_ => encode_bitpack(&bits),
}
}
KType::Int => {
let nums: Vec<i64> = values.iter().map(|v| v.as_i64()).collect();
match codec {
Codec::CDelta => encode_cdelta(&nums),
Codec::FOR => encode_for(&nums),
Codec::Delta => encode_delta_int(&nums),
Codec::RLE => encode_rle_int(&nums),
// Delta is the fallback for unexpected codecs.
_ => encode_delta_int(&nums),
}
}
KType::Float => {
// Scale exponent stored for the decoder (0..=4 -> 1, 10, ..., 10000).
// NOTE(review): any fscale other than 1/10/100/1000 maps to exponent 4
// and will be decoded at 10000 — confirm callers only pass those values.
let scale_exp: u8 = match fscale as u32 {
1 => 0, 10 => 1, 100 => 2, 1000 => 3, _ => 4,
};
let nums: Vec<i64> = values.iter().map(|v| (v.as_f64() * fscale).round() as i64).collect();
let encoded = match codec {
Codec::CDelta => encode_cdelta(&nums),
Codec::FOR => encode_for(&nums),
Codec::Delta => encode_delta_int(&nums),
Codec::RLE => encode_rle_int(&nums),
_ => encode_delta_int(&nums),
};
// 0xFE marker + exponent byte let the decoder detect the scale prefix.
let mut buf = Vec::with_capacity(2 + encoded.len());
buf.push(0xFE);
buf.push(scale_exp);
buf.extend_from_slice(&encoded);
buf
}
KType::Str => {
let strs: Vec<&str> = values.iter().map(|v| v.as_str()).collect();
match codec {
Codec::HuffDict=> encode_huffdict(&strs),
Codec::BDict => encode_bdict(&strs),
Codec::DictRLE=> encode_dict_rle(&strs, global_dict),
Codec::RLE => encode_rle_str(&strs),
_ => encode_raw_str(&strs),
}
}
KType::Bytes => {
// Varint length + raw bytes per value; non-Bytes values store length 0.
let mut buf = Vec::new();
for v in values {
if let KVal::Bytes(b) = v {
write_varint(&mut buf, b.len() as u64);
buf.extend_from_slice(b);
} else {
write_varint(&mut buf, 0);
}
}
buf
}
KType::Struct | KType::List | KType::Map => {
// Each nested value is length-prefixed so the decoder can skip it.
let mut buf = Vec::new();
for v in values {
let encoded = encode_nested_val(v);
write_varint(&mut buf, encoded.len() as u64);
buf.extend_from_slice(&encoded);
}
buf
}
}
}
/// Serializes a nested value as a 1-byte type tag plus a tag-specific
/// payload. Child values are varint-length-prefixed so a reader can skip
/// them without fully decoding. Tags: 0 = null, 1 = int (zigzag),
/// 2 = float (LE f64), 3 = str, 4 = bool, 5 = bytes, 6 = struct,
/// 7 = list, 8 = map.
fn encode_nested_val(v: &KVal) -> Vec<u8> {
    // Appends one length-prefixed child encoding to `out`.
    fn put_child(out: &mut Vec<u8>, child: &KVal) {
        let bytes = encode_nested_val(child);
        write_varint(out, bytes.len() as u64);
        out.extend_from_slice(&bytes);
    }
    let mut out = Vec::new();
    match v {
        KVal::Null => out.push(0),
        KVal::Int(n) => {
            out.push(1);
            write_zvar(&mut out, *n);
        }
        KVal::Float(f) => {
            out.push(2);
            out.extend_from_slice(&f.to_le_bytes());
        }
        KVal::Str(s) => {
            out.push(3);
            write_varint(&mut out, s.len() as u64);
            out.extend_from_slice(s.as_bytes());
        }
        KVal::Bool(b) => {
            out.push(4);
            out.push(*b as u8);
        }
        KVal::Bytes(b) => {
            out.push(5);
            write_varint(&mut out, b.len() as u64);
            out.extend_from_slice(b);
        }
        KVal::Struct(fields) => {
            out.push(6);
            write_varint(&mut out, fields.len() as u64);
            for (name, val) in fields {
                write_varint(&mut out, name.len() as u64);
                out.extend_from_slice(name.as_bytes());
                put_child(&mut out, val);
            }
        }
        KVal::List(items) => {
            out.push(7);
            write_varint(&mut out, items.len() as u64);
            for item in items {
                put_child(&mut out, item);
            }
        }
        KVal::Map(pairs) => {
            out.push(8);
            write_varint(&mut out, pairs.len() as u64);
            for (key, val) in pairs {
                put_child(&mut out, key);
                put_child(&mut out, val);
            }
        }
    }
    out
}
/// Decodes one nested value produced by `encode_nested_val`, returning the
/// value and the position just past it. Out-of-range positions and unknown
/// tags decode as `KVal::Null`. Children are advanced over using their
/// recorded length prefix (not the decoder's own cursor), so a corrupt
/// child cannot desynchronize its siblings.
///
/// NOTE(review): recursion depth follows the input nesting with no limit —
/// a deeply nested (malicious) payload could overflow the stack; confirm
/// whether inputs are trusted here.
fn decode_nested_val(data: &[u8], pos: usize) -> (KVal, usize) {
if pos >= data.len() { return (KVal::Null, pos); }
let tag = data[pos];
let mut p = pos + 1;
match tag {
0 => (KVal::Null, p),
1 => { let (n, p2) = read_zvar(data, p); (KVal::Int(n), p2) }
2 => {
// Fixed 8-byte little-endian f64; truncated payload decodes as null.
if p + 8 > data.len() { return (KVal::Null, p); }
let f = f64::from_le_bytes(data[p..p+8].try_into().unwrap_or([0; 8]));
(KVal::Float(f), p + 8)
}
3 => {
let (slen, p2) = read_varint(data, p); p = p2;
// Clamp to the buffer so corrupt lengths cannot panic the slice.
let end = (p + slen as usize).min(data.len());
let s = String::from_utf8_lossy(&data[p..end]).into_owned();
(KVal::Str(s), end)
}
4 => { let b = data.get(p).copied().unwrap_or(0) != 0; (KVal::Bool(b), p + 1) }
5 => {
let (blen, p2) = read_varint(data, p); p = p2;
let end = (p + blen as usize).min(data.len());
(KVal::Bytes(data[p..end].to_vec()), end)
}
6 => { let (nfields, p2) = read_varint(data, p); p = p2;
let mut fields = Vec::with_capacity(nfields as usize);
for _ in 0..nfields {
let (nlen, p2) = read_varint(data, p); p = p2;
let end = (p + nlen as usize).min(data.len());
let name = String::from_utf8_lossy(&data[p..end]).into_owned();
p = end;
// Skip the child by its recorded length, not the decode cursor.
let (clen, p2) = read_varint(data, p); p = p2;
let (val, _) = decode_nested_val(data, p);
p += clen as usize;
fields.push((name, val));
}
(KVal::Struct(fields), p)
}
7 => { let (nitems, p2) = read_varint(data, p); p = p2;
let mut items = Vec::with_capacity(nitems as usize);
for _ in 0..nitems {
let (clen, p2) = read_varint(data, p); p = p2;
let (val, _) = decode_nested_val(data, p);
p += clen as usize;
items.push(val);
}
(KVal::List(items), p)
}
8 => { let (npairs, p2) = read_varint(data, p); p = p2;
let mut pairs = Vec::with_capacity(npairs as usize);
for _ in 0..npairs {
let (klen, p2) = read_varint(data, p); p = p2;
let (key, _) = decode_nested_val(data, p);
p += klen as usize;
let (vlen, p2) = read_varint(data, p); p = p2;
let (val, _) = decode_nested_val(data, p);
p += vlen as usize;
pairs.push((key, val));
}
(KVal::Map(pairs), p)
}
// Unknown tag: treat as null and let the caller's length prefix resync.
_ => (KVal::Null, p),
}
}
/// Decodes one chunk's column payload back into `KVal`s according to the
/// column's declared type and codec. `dict` is the file-global string
/// dictionary used by the DictRLE codec.
///
/// Hardened against short/corrupt payloads: the bool Raw path previously
/// sliced `data[..nrows]` and panicked when the payload was shorter than
/// `nrows`, and the Bytes path clamped only the slice end (a corrupt length
/// could push the start offset past the buffer). Missing data now decodes
/// to default values (false / empty) instead of panicking.
fn decode_column_data(
    data: &[u8],
    col: &KColumn,
    codec: Codec,
    nrows: usize,
    dict: &[String],
) -> Vec<KVal> {
    match col.ktype {
        KType::Bool => match codec {
            Codec::RLE => {
                let (nums, _) = decode_rle_int(data, 0, nrows);
                nums.into_iter().map(|n| KVal::Bool(n != 0)).collect()
            }
            // One byte per value; bytes past the end read as false.
            Codec::Raw => (0..nrows)
                .map(|i| KVal::Bool(data.get(i).copied().unwrap_or(0) != 0))
                .collect(),
            _ => {
                let (bits, _) = decode_bitpack(data, 0, nrows);
                bits.into_iter().map(KVal::Bool).collect()
            }
        },
        KType::Int => {
            let nums = match codec {
                Codec::CDelta => decode_cdelta(data, 0, nrows).0,
                Codec::FOR => decode_for(data, 0, nrows).0,
                Codec::Delta => decode_delta_int(data, 0, nrows).0,
                Codec::RLE => decode_rle_int(data, 0, nrows).0,
                // Delta is also the fallback for unknown codecs.
                _ => decode_delta_int(data, 0, nrows).0,
            };
            nums.into_iter().map(KVal::Int).collect()
        }
        KType::Float => {
            // Optional 2-byte prefix (0xFE marker + scale exponent) written by
            // the encoder; payloads without it default to scale 10000.
            let (scale, body) = if data.len() >= 2 && data[0] == 0xFE {
                let s = match data[1] { 0 => 1.0, 1 => 10.0, 2 => 100.0, 3 => 1000.0, _ => 10000.0 };
                (s, &data[2..])
            } else {
                (10000.0, data)
            };
            let nums = match codec {
                Codec::CDelta => decode_cdelta(body, 0, nrows).0,
                Codec::FOR => decode_for(body, 0, nrows).0,
                Codec::Delta => decode_delta_int(body, 0, nrows).0,
                Codec::RLE => decode_rle_int(body, 0, nrows).0,
                _ => decode_delta_int(body, 0, nrows).0,
            };
            nums.into_iter().map(|n| KVal::Float(n as f64 / scale)).collect()
        }
        KType::Str => {
            let strs = match codec {
                Codec::HuffDict => decode_huffdict(data, 0, nrows).0,
                Codec::BDict => decode_bdict(data, 0, nrows).0,
                Codec::DictRLE => decode_dict_rle(data, 0, nrows, dict).0,
                Codec::RLE => decode_rle_str(data, 0, nrows).0,
                _ => decode_raw_str(data, 0, nrows).0,
            };
            strs.into_iter().map(KVal::Str).collect()
        }
        KType::Bytes => {
            let mut out = Vec::with_capacity(nrows);
            let mut p = 0usize;
            for _ in 0..nrows {
                let (len, np) = read_varint(data, p);
                // Clamp both slice bounds against corrupt lengths/offsets.
                let start = np.min(data.len());
                let end = (np + len as usize).min(data.len());
                out.push(KVal::Bytes(data[start..end].to_vec()));
                p = np + len as usize;
            }
            out
        }
        KType::Struct | KType::List | KType::Map => {
            let mut out = Vec::with_capacity(nrows);
            let mut p = 0usize;
            for _ in 0..nrows {
                // Advance by the recorded length so one corrupt value cannot
                // desynchronize the rest of the column.
                let (blen, p2) = read_varint(data, p);
                p = p2;
                let (val, _) = decode_nested_val(data, p);
                p += blen as usize;
                out.push(val);
            }
            out
        }
    }
}
/// Computes per-chunk column statistics in a single pass: the null count
/// plus min/max over non-null values — numeric (bool/float coerced through
/// `as_i64`) for Int/Float/Bool columns, lexicographic for Str columns.
/// A column with no numeric values reports a 0..0 integer range; other
/// types track only the null count.
fn compute_stats(values: &[KVal], ktype: KType) -> ColStats {
    let mut stats = ColStats {
        null_count: 0,
        min_i64: i64::MAX,
        max_i64: i64::MIN,
        min_str: String::new(),
        max_str: String::new(),
    };
    let mut saw_str = false;
    for v in values {
        if v.is_null() {
            stats.null_count += 1;
            continue;
        }
        match ktype {
            KType::Int | KType::Float | KType::Bool => {
                let n = v.as_i64();
                stats.min_i64 = stats.min_i64.min(n);
                stats.max_i64 = stats.max_i64.max(n);
            }
            KType::Str => {
                let s = v.as_str();
                if !saw_str {
                    // First non-null string seeds both bounds.
                    stats.min_str = s.to_string();
                    stats.max_str = s.to_string();
                    saw_str = true;
                } else {
                    if s < stats.min_str.as_str() { stats.min_str = s.to_string(); }
                    if s > stats.max_str.as_str() { stats.max_str = s.to_string(); }
                }
            }
            _ => {}
        }
    }
    // No numeric values seen: normalize the sentinel bounds to 0..0.
    if stats.min_i64 == i64::MAX {
        stats.min_i64 = 0;
        stats.max_i64 = 0;
    }
    stats
}
/// Writes `data` to `path` atomically: write and fsync a temp file next to
/// the destination, then rename it into place.
///
/// `rename` replaces an existing destination atomically on POSIX, so it is
/// attempted first; only if it fails (e.g. on Windows, where renaming over
/// an existing file errors) do we fall back to remove + rename — the old
/// code always removed first, leaving a window with no file at `path` even
/// on platforms that did not need it. The temp file is cleaned up on write
/// failure instead of being leaked.
fn atomic_write(path: &str, data: &[u8]) -> Result<(), String> {
    use std::io::Write;
    let tmp = format!("{}.tmp.{}", path, std::process::id());
    let write_result = (|| -> Result<(), String> {
        let mut f = std::fs::File::create(&tmp).map_err(|e| format!("create {}: {}", tmp, e))?;
        f.write_all(data).map_err(|e| format!("write {}: {}", tmp, e))?;
        // fsync before rename so a crash cannot leave a renamed-but-empty file.
        f.sync_all().map_err(|e| format!("fsync {}: {}", tmp, e))
    })();
    if let Err(e) = write_result {
        let _ = std::fs::remove_file(&tmp); // best-effort cleanup
        return Err(e);
    }
    // Atomic replace where the platform supports it.
    if std::fs::rename(&tmp, path).is_ok() {
        return Ok(());
    }
    // Fallback for platforms where rename cannot replace an existing file.
    if std::path::Path::new(path).exists() {
        std::fs::remove_file(path).map_err(|e| format!("remove {}: {}", path, e))?;
    }
    std::fs::rename(&tmp, path).map_err(|e| format!("rename {} -> {}: {}", tmp, path, e))
}
#[cfg(test)]
#[allow(clippy::items_after_test_module)]
mod tests {
use super::*;
fn temp_path(name: &str) -> String {
let mut path = std::env::temp_dir();
let stamp = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_micros();
path.push(format!("{}_{}.kore", name, stamp));
path.to_string_lossy().into_owned()
}
#[test]
fn write_read_roundtrip_basic() {
let col = KColumn::new("id", KType::Int);
let writer = KoreWriter::with_chunk_size(vec![col], 1024);
let rows = vec![vec![KVal::Int(1)], vec![KVal::Int(2)]];
let p = temp_path("kore_rt_basic");
let res = writer.write(&p, &rows);
assert!(res.is_ok(), "writer error: {:?}", res);
let rdr = KoreReader::open(&p).expect("open reader");
let cols = rdr.read_all_columns();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].len(), 2);
match cols[0][0] {
KVal::Int(x) => assert_eq!(x, 1),
_ => panic!("expected int"),
}
let _ = std::fs::remove_file(&p);
}
#[test]
fn write_read_roundtrip_mixed_types() {
let cols = vec![
KColumn::new("id", KType::Int),
KColumn::new("name", KType::Str),
KColumn::new("flag", KType::Bool),
KColumn::new("score", KType::Float),
];
let writer = KoreWriter::with_chunk_size(cols, 1024);
let rows = vec![
vec![KVal::Int(1), KVal::Str("alice".to_string()), KVal::Bool(true), KVal::Float(3.5)],
vec![KVal::Int(2), KVal::Str("bob".to_string()), KVal::Bool(false), KVal::Float(7.25)],
vec![KVal::Int(3), KVal::Str("carol".to_string()), KVal::Bool(true), KVal::Float(0.5)],
];
let p = temp_path("kore_rt_mixed");
writer.write(&p, &rows).expect("write mixed");
let rdr = KoreReader::open(&p).expect("open mixed");
let out = rdr.read_all_columns();
assert_eq!(out.len(), 4);
assert_eq!(out[0].len(), 3);
match &out[1][1] {
KVal::Str(s) => assert_eq!(s, "bob"),
_ => panic!("expected string"),
}
match out[2][0] {
KVal::Bool(b) => assert!(b),
_ => panic!("expected bool"),
}
let _ = std::fs::remove_file(&p);
}
#[test]
fn multi_chunk_row_access_works() {
    // Five rows with a chunk size of 2 must split into three chunks.
    let schema = vec![KColumn::new("id", KType::Int), KColumn::new("name", KType::Str)];
    let rows: Vec<Vec<KVal>> = (0..5)
        .map(|i| vec![KVal::Int(10 + i as i64), KVal::Str(format!("r{}", i))])
        .collect();
    let path = temp_path("kore_rt_chunks");
    KoreWriter::with_chunk_size(schema, 2)
        .write(&path, &rows)
        .expect("write chunks");
    let reader = KoreReader::open(&path).expect("open chunks");
    assert_eq!(reader.nchunks, 3);
    // Row 3 lives in the second chunk, so this exercises chunk lookup.
    let row = reader.read_row(3).expect("row 3");
    match row[0] {
        KVal::Int(v) => assert_eq!(v, 13),
        _ => panic!("expected int row value"),
    }
    // The half-open range [1, 4) spans a chunk boundary and yields 3 rows.
    assert_eq!(reader.read_row_range(1, 4).len(), 3);
    let _ = std::fs::remove_file(&path);
}
#[test]
fn corrupted_block_yields_nulls_not_panic() {
    // A flipped byte inside a data block must not crash the reader.
    let writer = KoreWriter::with_chunk_size(vec![KColumn::new("id", KType::Int)], 1024);
    let rows: Vec<Vec<KVal>> = (1..=3i64).map(|i| vec![KVal::Int(i)]).collect();
    let path = temp_path("kore_rt_corrupt");
    writer.write(&path, &rows).expect("write corrupt target");
    // Corrupt one byte past the header so a block checksum mismatches.
    let mut bytes = std::fs::read(&path).expect("read file");
    if bytes.len() > 100 {
        bytes[90] ^= 0xAA;
    }
    std::fs::write(&path, &bytes).expect("rewrite corrupted");
    // The file must still open and report the full table shape.
    let reader = KoreReader::open(&path).expect("open corrupted");
    let out = reader.read_all_columns();
    assert_eq!(out.len(), 1);
    assert_eq!(out[0].len(), 3);
    let _ = std::fs::remove_file(&path);
}
}
/// Columnar writer for the KORE v2 container format.
///
/// Splits input data into fixed-size row chunks, encodes each column of each
/// chunk with a per-chunk codec, and emits a self-describing file: header,
/// compressed schema, shared string dictionary, checksummed data blocks, and
/// a compressed footer index.
pub struct KoreWriter {
// Column schema in file order; positions index into each input row.
pub columns: Vec<KColumn>,
// Rows per chunk; `with_chunk_size` clamps this to at least 1.
pub chunk_size: usize,
}
impl KoreWriter {
pub fn new(columns: Vec<KColumn>) -> Self {
KoreWriter { columns, chunk_size: DEFAULT_CHUNK_SIZE }
}
pub fn with_chunk_size(columns: Vec<KColumn>, chunk_size: usize) -> Self {
KoreWriter { columns, chunk_size: chunk_size.max(1) }
}
pub fn write(&self, path: &str, rows: &[Vec<KVal>]) -> Result<String, String> {
if rows.is_empty() { return Err("No rows to write".to_string()); }
let ncols = self.columns.len();
let nrows = rows.len();
let nchunks = nrows.div_ceil(self.chunk_size);
let mut dict_map: HashMap<String, u32> = HashMap::new();
let mut dict_list: Vec<String> = Vec::new();
for row in rows {
for (ci, val) in row.iter().enumerate() {
if ci < ncols && self.columns[ci].ktype == KType::Str {
let s = val.as_str().to_string();
if !dict_map.contains_key(&s) {
let idx = dict_list.len() as u32;
dict_map.insert(s.clone(), idx);
dict_list.push(s);
}
}
}
}
let mut out: Vec<u8> = Vec::with_capacity(nrows * ncols * 4);
out.extend_from_slice(KORE_MAGIC); out.push(KORE_V2); out.push(0u8); out.extend_from_slice(&(ncols as u16).to_le_bytes()); out.extend_from_slice(&(nrows as u64).to_le_bytes()); out.extend_from_slice(&(nchunks as u32).to_le_bytes()); out.extend_from_slice(&(self.chunk_size as u32).to_le_bytes()); let ts = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH).unwrap_or_default().as_secs();
out.extend_from_slice(&ts.to_le_bytes()); out.extend_from_slice(&[0u8; 32]);
let mut schema_raw = Vec::new();
for col in &self.columns {
let nb = col.name.as_bytes();
write_varint(&mut schema_raw, nb.len() as u64);
schema_raw.extend_from_slice(nb);
schema_raw.push(col.ktype as u8);
schema_raw.push(if col.encrypted { 1 } else { 0 });
}
let schema_comp = compress_block(&schema_raw);
out.extend_from_slice(&(schema_comp.len() as u32).to_le_bytes());
out.extend_from_slice(&schema_comp);
let mut dict_raw = Vec::new();
write_varint(&mut dict_raw, dict_list.len() as u64);
for entry in &dict_list {
let b = entry.as_bytes();
write_varint(&mut dict_raw, b.len() as u64);
dict_raw.extend_from_slice(b);
}
let dict_comp = compress_block(&dict_raw);
out.extend_from_slice(&(dict_comp.len() as u32).to_le_bytes());
out.extend_from_slice(&dict_comp);
struct ChunkColMeta {
file_offset: u64,
comp_len: u32,
codec: u8,
stats: ColStats,
bloom: Bloom,
}
let mut all_meta: Vec<Vec<ChunkColMeta>> = Vec::with_capacity(nchunks);
for chunk_idx in 0..nchunks {
let rstart = chunk_idx * self.chunk_size;
let rend = (rstart + self.chunk_size).min(nrows);
let chunk_rows = &rows[rstart..rend];
let _chunk_nrows = chunk_rows.len();
let mut chunk_meta = Vec::with_capacity(ncols);
for (ci, col) in self.columns.iter().enumerate().take(ncols) {
let vals: Vec<KVal> = chunk_rows.iter()
.map(|r| r.get(ci).cloned().unwrap_or(KVal::Null))
.collect();
let codec = match col.ktype {
KType::Bool => Codec::Bitpack,
KType::Int | KType::Float => {
let nums: Vec<i64> = if col.ktype == KType::Float {
vals.iter().map(|v| (v.as_f64() * 10000.0).round() as i64).collect()
} else {
vals.iter().map(|v| v.as_i64()).collect()
};
select_int_codec(&nums)
}
KType::Str => {
let strs: Vec<&str> = vals.iter().map(|v| v.as_str()).collect();
select_str_codec(&strs)
}
_ => Codec::Raw,
};
let stats = compute_stats(&vals, col.ktype);
let mut bloom = Bloom::new();
if col.ktype == KType::Str {
for v in &vals { bloom.insert(v.as_str()); }
}
let codec_data = encode_column_data(&vals, col, codec, &dict_map);
let codec_data = if col.encrypted {
let nonce = derive_nonce(&col.name, chunk_idx);
aes256_ctr(&codec_data, &col.enc_key, &nonce)
} else {
codec_data
};
let compressed = compress_block(&codec_data);
let checksum = crc32(&compressed);
let file_offset = out.len() as u64;
out.extend_from_slice(&checksum.to_le_bytes());
out.extend_from_slice(&(compressed.len() as u32).to_le_bytes());
out.extend_from_slice(&compressed);
chunk_meta.push(ChunkColMeta {
file_offset,
comp_len: compressed.len() as u32,
codec: codec as u8,
stats,
bloom,
});
}
all_meta.push(chunk_meta);
}
let mut footer_raw = Vec::new();
footer_raw.extend_from_slice(&(nchunks as u32).to_le_bytes());
footer_raw.extend_from_slice(&(ncols as u16).to_le_bytes());
for chunk_idx in 0..nchunks {
let rstart = chunk_idx * self.chunk_size;
let rend = (rstart + self.chunk_size).min(nrows);
footer_raw.extend_from_slice(&((rend - rstart) as u32).to_le_bytes());
}
for chunk_meta in &all_meta {
for cm in chunk_meta {
footer_raw.extend_from_slice(&cm.file_offset.to_le_bytes());
footer_raw.extend_from_slice(&cm.comp_len.to_le_bytes());
footer_raw.push(cm.codec);
footer_raw.extend_from_slice(&cm.stats.null_count.to_le_bytes());
write_zvar(&mut footer_raw, cm.stats.min_i64);
write_zvar(&mut footer_raw, cm.stats.max_i64);
let min_b = cm.stats.min_str.as_bytes();
write_varint(&mut footer_raw, min_b.len() as u64);
footer_raw.extend_from_slice(min_b);
let max_b = cm.stats.max_str.as_bytes();
write_varint(&mut footer_raw, max_b.len() as u64);
footer_raw.extend_from_slice(max_b);
footer_raw.extend_from_slice(&cm.bloom.to_bytes());
}
}
let footer_comp = compress_block(&footer_raw);
let footer_offset = out.len() as u64;
out.extend_from_slice(&footer_comp);
out.extend_from_slice(&(footer_comp.len() as u32).to_le_bytes());
out.extend_from_slice(&footer_offset.to_le_bytes());
atomic_write(path, &out)
.map_err(|e| format!("Cannot write {}: {}", path, e))?;
let ratio = if nrows > 0 {
let raw_est: usize = rows.iter()
.flat_map(|r| r.iter().map(|v| v.display().len() + 1))
.sum();
if raw_est > 0 { out.len() as f64 / raw_est as f64 * 100.0 } else { 100.0 }
} else { 100.0 };
Ok(format!(
"KORE v2: {} rows × {} cols | {} chunks | {} bytes ({:.1}% of raw) | dict: {} entries",
nrows, ncols, nchunks, out.len(), ratio, dict_list.len()
))
}
pub fn write_columns(&self, path: &str, cols: &[Vec<KVal>], nrows: usize) -> Result<String, String> {
if nrows == 0 { return Err("No rows to write".to_string()); }
let ncols = self.columns.len();
let nchunks = nrows.div_ceil(self.chunk_size);
let mut dict_map: HashMap<String, u32> = HashMap::new();
let mut dict_list: Vec<String> = Vec::new();
for (ci, col) in self.columns.iter().enumerate().take(ncols) {
if col.ktype == KType::Str {
for v in &cols[ci] {
let s = v.as_str().to_string();
if !dict_map.contains_key(&s) {
let idx = dict_list.len() as u32;
dict_map.insert(s.clone(), idx);
dict_list.push(s);
}
}
}
}
let mut out: Vec<u8> = Vec::with_capacity(nrows * ncols * 4);
out.extend_from_slice(KORE_MAGIC);
out.push(KORE_V2);
out.push(0u8);
out.extend_from_slice(&(ncols as u16).to_le_bytes());
out.extend_from_slice(&(nrows as u64).to_le_bytes());
out.extend_from_slice(&(nchunks as u32).to_le_bytes());
out.extend_from_slice(&(self.chunk_size as u32).to_le_bytes());
let ts = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH).unwrap_or_default().as_secs();
out.extend_from_slice(&ts.to_le_bytes());
out.extend_from_slice(&[0u8; 32]);
let mut schema_raw = Vec::new();
for col in &self.columns {
let nb = col.name.as_bytes();
write_varint(&mut schema_raw, nb.len() as u64);
schema_raw.extend_from_slice(nb);
schema_raw.push(col.ktype as u8);
schema_raw.push(if col.encrypted { 1 } else { 0 });
}
let schema_comp = compress_block(&schema_raw);
out.extend_from_slice(&(schema_comp.len() as u32).to_le_bytes());
out.extend_from_slice(&schema_comp);
let mut dict_raw = Vec::new();
write_varint(&mut dict_raw, dict_list.len() as u64);
for entry in &dict_list {
let b = entry.as_bytes();
write_varint(&mut dict_raw, b.len() as u64);
dict_raw.extend_from_slice(b);
}
let dict_comp = compress_block(&dict_raw);
out.extend_from_slice(&(dict_comp.len() as u32).to_le_bytes());
out.extend_from_slice(&dict_comp);
struct ChunkColMeta {
file_offset: u64,
comp_len: u32,
codec: u8,
stats: ColStats,
bloom: Bloom,
}
let mut all_meta: Vec<Vec<ChunkColMeta>> = Vec::with_capacity(nchunks);
for chunk_idx in 0..nchunks {
let rstart = chunk_idx * self.chunk_size;
let rend = (rstart + self.chunk_size).min(nrows);
let mut chunk_meta = Vec::with_capacity(ncols);
for (ci, col) in self.columns.iter().enumerate().take(ncols) {
let vals = &cols[ci][rstart..rend];
let codec = match col.ktype {
KType::Bool => Codec::Bitpack,
KType::Int | KType::Float => {
let nums: Vec<i64> = if col.ktype == KType::Float {
vals.iter().map(|v| (v.as_f64() * 10000.0).round() as i64).collect()
} else {
vals.iter().map(|v| v.as_i64()).collect()
};
select_int_codec(&nums)
}
KType::Str => {
let strs: Vec<&str> = vals.iter().map(|v| v.as_str()).collect();
select_str_codec(&strs)
}
_ => Codec::Raw,
};
let stats = compute_stats(vals, col.ktype);
let mut bloom = Bloom::new();
if col.ktype == KType::Str {
for v in vals { bloom.insert(v.as_str()); }
}
let codec_data = encode_column_data(vals, col, codec, &dict_map);
let codec_data = if col.encrypted {
let nonce = derive_nonce(&col.name, chunk_idx);
aes256_ctr(&codec_data, &col.enc_key, &nonce)
} else {
codec_data
};
let compressed = compress_block(&codec_data);
let checksum = crc32(&compressed);
let file_offset = out.len() as u64;
out.extend_from_slice(&checksum.to_le_bytes());
out.extend_from_slice(&(compressed.len() as u32).to_le_bytes());
out.extend_from_slice(&compressed);
chunk_meta.push(ChunkColMeta {
file_offset,
comp_len: compressed.len() as u32,
codec: codec as u8,
stats,
bloom,
});
}
all_meta.push(chunk_meta);
}
let mut footer_raw = Vec::new();
footer_raw.extend_from_slice(&(nchunks as u32).to_le_bytes());
footer_raw.extend_from_slice(&(ncols as u16).to_le_bytes());
for chunk_idx in 0..nchunks {
let rstart = chunk_idx * self.chunk_size;
let rend = (rstart + self.chunk_size).min(nrows);
footer_raw.extend_from_slice(&((rend - rstart) as u32).to_le_bytes());
}
for chunk_meta in &all_meta {
for cm in chunk_meta {
footer_raw.extend_from_slice(&cm.file_offset.to_le_bytes());
footer_raw.extend_from_slice(&cm.comp_len.to_le_bytes());
footer_raw.push(cm.codec);
footer_raw.extend_from_slice(&cm.stats.null_count.to_le_bytes());
write_zvar(&mut footer_raw, cm.stats.min_i64);
write_zvar(&mut footer_raw, cm.stats.max_i64);
let min_b = cm.stats.min_str.as_bytes();
write_varint(&mut footer_raw, min_b.len() as u64);
footer_raw.extend_from_slice(min_b);
let max_b = cm.stats.max_str.as_bytes();
write_varint(&mut footer_raw, max_b.len() as u64);
footer_raw.extend_from_slice(max_b);
footer_raw.extend_from_slice(&cm.bloom.to_bytes());
}
}
let footer_comp = compress_block(&footer_raw);
let footer_offset = out.len() as u64;
out.extend_from_slice(&footer_comp);
out.extend_from_slice(&(footer_comp.len() as u32).to_le_bytes());
out.extend_from_slice(&footer_offset.to_le_bytes());
atomic_write(path, &out)
.map_err(|e| format!("Cannot write {}: {}", path, e))?;
Ok(format!(
"KORE v2: {} rows × {} cols | {} chunks | {} bytes ({:.1}% of raw) | dict: {} entries",
nrows, ncols, nchunks, out.len(),
out.len() as f64 / (nrows * ncols * 8).max(1) as f64 * 100.0,
dict_list.len()
))
}
}