use std::collections::HashMap;
use std::fmt;
use crate::error::{IoError, Result};
/// Eight-byte magic value: the ASCII bytes `SCIRCOL` followed by the byte `0x01`.
///
/// NOTE(review): presumably the leading signature of a serialized columnar
/// file; the reader/writer is not visible in this part of the file, so confirm.
pub const COLUMNAR_MAGIC: &[u8; 8] = b"SCIRCOL\x01";
/// Format version number, currently `1`.
///
/// NOTE(review): presumably stored in the file header next to the magic value;
/// confirm against the reader/writer.
pub const FORMAT_VERSION: u32 = 1;
/// One-byte tag identifying the element type of a column.
///
/// The enum is `#[repr(u8)]` with explicit discriminants; the `TryFrom<u8>`
/// implementation in this file maps the same byte values back to variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum ColumnTypeTag {
/// Tag `0`: column of `f64` values (see `ColumnData::Float64`).
Float64 = 0,
/// Tag `1`: column of `i64` values (see `ColumnData::Int64`).
Int64 = 1,
/// Tag `2`: column of `String` values (see `ColumnData::Str`).
Str = 2,
/// Tag `3`: column of `bool` values (see `ColumnData::Bool`).
Bool = 3,
}
impl TryFrom<u8> for ColumnTypeTag {
    type Error = IoError;

    /// Decodes a raw tag byte into a [`ColumnTypeTag`].
    ///
    /// # Errors
    ///
    /// Returns [`IoError::FormatError`] when `value` is not one of the known
    /// tag bytes `0`, `1`, `2` or `3`.
    fn try_from(value: u8) -> std::result::Result<Self, Self::Error> {
        let tag = match value {
            0 => Self::Float64,
            1 => Self::Int64,
            2 => Self::Str,
            3 => Self::Bool,
            unknown => {
                let message = format!("Unknown column type tag: {}", unknown);
                return Err(IoError::FormatError(message));
            }
        };
        Ok(tag)
    }
}
/// One-byte tag naming the encoding applied to a column.
///
/// The enum is `#[repr(u8)]` with explicit discriminants; the `TryFrom<u8>`
/// implementation in this file maps the same byte values back to variants.
/// `ColumnData::best_encoding` picks one of these per column.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum EncodingType {
/// Tag `0`: the fallback chosen by `best_encoding` when no other encoding applies.
/// NOTE(review): presumably values are stored as-is; the encoder is not visible here.
Plain = 0,
/// Tag `1`: chosen by `best_encoding` when the `has_runs_*` heuristic reports repeated values.
/// NOTE(review): presumably run-length encoding; confirm against the encoder.
Rle = 1,
/// Tag `2`: chosen by `best_encoding` for string columns with few distinct values.
Dictionary = 2,
/// Tag `3`: chosen by `best_encoding` for numeric columns whose values are non-decreasing.
Delta = 3,
}
impl TryFrom<u8> for EncodingType {
    type Error = IoError;

    /// Decodes a raw encoding byte into an [`EncodingType`].
    ///
    /// # Errors
    ///
    /// Returns [`IoError::FormatError`] when `value` is not one of the known
    /// encoding bytes `0`, `1`, `2` or `3`.
    fn try_from(value: u8) -> std::result::Result<Self, Self::Error> {
        let encoding = match value {
            0 => Self::Plain,
            1 => Self::Rle,
            2 => Self::Dictionary,
            3 => Self::Delta,
            unknown => {
                let message = format!("Unknown encoding type: {}", unknown);
                return Err(IoError::FormatError(message));
            }
        };
        Ok(encoding)
    }
}
/// Typed storage for the values of a single column: one `Vec` per supported
/// element type.
#[derive(Debug, Clone)]
pub enum ColumnData {
/// Column of 64-bit floating point values.
Float64(Vec<f64>),
/// Column of 64-bit signed integers.
Int64(Vec<i64>),
/// Column of owned strings.
Str(Vec<String>),
/// Column of booleans.
Bool(Vec<bool>),
}
impl ColumnData {
    /// Number of values held by the column, whatever its element type.
    pub fn len(&self) -> usize {
        match self {
            Self::Float64(values) => values.len(),
            Self::Int64(values) => values.len(),
            Self::Str(values) => values.len(),
            Self::Bool(values) => values.len(),
        }
    }

    /// `true` when the column holds no values.
    pub fn is_empty(&self) -> bool {
        match self {
            Self::Float64(values) => values.is_empty(),
            Self::Int64(values) => values.is_empty(),
            Self::Str(values) => values.is_empty(),
            Self::Bool(values) => values.is_empty(),
        }
    }

    /// The [`ColumnTypeTag`] corresponding to the stored variant.
    pub fn type_tag(&self) -> ColumnTypeTag {
        match self {
            Self::Float64(_) => ColumnTypeTag::Float64,
            Self::Int64(_) => ColumnTypeTag::Int64,
            Self::Str(_) => ColumnTypeTag::Str,
            Self::Bool(_) => ColumnTypeTag::Bool,
        }
    }

    /// Borrows the values as `f64`.
    ///
    /// # Errors
    ///
    /// Returns [`IoError::ConversionError`] when the column is not `Float64`.
    pub fn as_f64(&self) -> Result<&[f64]> {
        if let Self::Float64(values) = self {
            return Ok(values.as_slice());
        }
        Err(IoError::ConversionError(format!(
            "Column is {:?}, not Float64",
            self.type_tag()
        )))
    }

    /// Borrows the values as `i64`.
    ///
    /// # Errors
    ///
    /// Returns [`IoError::ConversionError`] when the column is not `Int64`.
    pub fn as_i64(&self) -> Result<&[i64]> {
        if let Self::Int64(values) = self {
            return Ok(values.as_slice());
        }
        Err(IoError::ConversionError(format!(
            "Column is {:?}, not Int64",
            self.type_tag()
        )))
    }

    /// Borrows the values as strings.
    ///
    /// # Errors
    ///
    /// Returns [`IoError::ConversionError`] when the column is not `Str`.
    pub fn as_str(&self) -> Result<&[String]> {
        if let Self::Str(values) = self {
            return Ok(values.as_slice());
        }
        Err(IoError::ConversionError(format!(
            "Column is {:?}, not Str",
            self.type_tag()
        )))
    }

    /// Borrows the values as booleans.
    ///
    /// # Errors
    ///
    /// Returns [`IoError::ConversionError`] when the column is not `Bool`.
    pub fn as_bool(&self) -> Result<&[bool]> {
        if let Self::Bool(values) = self {
            return Ok(values.as_slice());
        }
        Err(IoError::ConversionError(format!(
            "Column is {:?}, not Bool",
            self.type_tag()
        )))
    }

    /// Suggests an [`EncodingType`] for the column's current contents.
    ///
    /// Checks are tried in order and the first match wins:
    /// - numeric columns: `Delta` when `is_sorted_*` holds, then `Rle` when
    ///   `has_runs_*` holds;
    /// - string columns: `Dictionary` when the number of distinct values is
    ///   below half the length (integer division), then `Rle` when
    ///   `has_runs_str` holds;
    /// - boolean columns: `Rle` when `has_runs_bool` holds.
    ///
    /// Anything else falls back to `Plain`.
    pub fn best_encoding(&self) -> EncodingType {
        match self {
            Self::Float64(values) if is_sorted_f64(values) => EncodingType::Delta,
            Self::Float64(values) if has_runs_f64(values) => EncodingType::Rle,
            Self::Float64(_) => EncodingType::Plain,

            Self::Int64(values) if is_sorted_i64(values) => EncodingType::Delta,
            Self::Int64(values) if has_runs_i64(values) => EncodingType::Rle,
            Self::Int64(_) => EncodingType::Plain,

            Self::Str(values) if count_unique_strings(values) < values.len() / 2 => {
                EncodingType::Dictionary
            }
            Self::Str(values) if has_runs_str(values) => EncodingType::Rle,
            Self::Str(_) => EncodingType::Plain,

            Self::Bool(values) if has_runs_bool(values) => EncodingType::Rle,
            Self::Bool(_) => EncodingType::Plain,
        }
    }
}
impl fmt::Display for ColumnData {
    /// Writes the variant name followed by the element count in brackets,
    /// e.g. `Int64[3]`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let (label, count) = match self {
            ColumnData::Float64(values) => ("Float64", values.len()),
            ColumnData::Int64(values) => ("Int64", values.len()),
            ColumnData::Str(values) => ("Str", values.len()),
            ColumnData::Bool(values) => ("Bool", values.len()),
        };
        write!(f, "{}[{}]", label, count)
    }
}
/// A named column: a name paired with its typed values.
#[derive(Debug, Clone)]
pub struct Column {
/// Column name; `ColumnarTable` uses it as the lookup key and rejects duplicates.
pub name: String,
/// The column's values.
pub data: ColumnData,
}
impl Column {
    /// Builds a column of `f64` values.
    pub fn float64(name: impl Into<String>, data: Vec<f64>) -> Self {
        let name = name.into();
        let data = ColumnData::Float64(data);
        Self { name, data }
    }

    /// Builds a column of `i64` values.
    pub fn int64(name: impl Into<String>, data: Vec<i64>) -> Self {
        let name = name.into();
        let data = ColumnData::Int64(data);
        Self { name, data }
    }

    /// Builds a column of string values.
    pub fn string(name: impl Into<String>, data: Vec<String>) -> Self {
        let name = name.into();
        let data = ColumnData::Str(data);
        Self { name, data }
    }

    /// Builds a column of boolean values.
    pub fn boolean(name: impl Into<String>, data: Vec<bool>) -> Self {
        let name = name.into();
        let data = ColumnData::Bool(data);
        Self { name, data }
    }

    /// Number of values (rows) in the column.
    pub fn len(&self) -> usize {
        let Self { data, .. } = self;
        data.len()
    }

    /// `true` when the column has no values.
    pub fn is_empty(&self) -> bool {
        let Self { data, .. } = self;
        data.is_empty()
    }
}
/// A table made of named columns with a name-to-position lookup.
///
/// The constructors and `add_column` in this file reject columns whose row
/// count differs from the existing ones and reject duplicate names.
#[derive(Debug, Clone)]
pub struct ColumnarTable {
// Columns in insertion order.
columns: Vec<Column>,
// Maps each column name to its position in `columns`.
index: HashMap<String, usize>,
}
impl ColumnarTable {
pub fn new() -> Self {
ColumnarTable {
columns: Vec::new(),
index: HashMap::new(),
}
}
pub fn from_columns(columns: Vec<Column>) -> Result<Self> {
if !columns.is_empty() {
let expected_len = columns[0].len();
for col in &columns[1..] {
if col.len() != expected_len {
return Err(IoError::FormatError(format!(
"Column '{}' has {} rows, expected {}",
col.name,
col.len(),
expected_len
)));
}
}
}
let mut index = HashMap::new();
for (i, col) in columns.iter().enumerate() {
if index.contains_key(&col.name) {
return Err(IoError::FormatError(format!(
"Duplicate column name: '{}'",
col.name
)));
}
index.insert(col.name.clone(), i);
}
Ok(ColumnarTable { columns, index })
}
pub fn add_column(&mut self, column: Column) -> Result<()> {
if !self.columns.is_empty() && column.len() != self.num_rows() {
return Err(IoError::FormatError(format!(
"Column '{}' has {} rows, expected {}",
column.name,
column.len(),
self.num_rows()
)));
}
if self.index.contains_key(&column.name) {
return Err(IoError::FormatError(format!(
"Duplicate column name: '{}'",
column.name
)));
}
let idx = self.columns.len();
self.index.insert(column.name.clone(), idx);
self.columns.push(column);
Ok(())
}
pub fn num_rows(&self) -> usize {
self.columns.first().map(|c| c.len()).unwrap_or(0)
}
pub fn num_columns(&self) -> usize {
self.columns.len()
}
pub fn column_names(&self) -> Vec<&str> {
self.columns.iter().map(|c| c.name.as_str()).collect()
}
pub fn column(&self, name: &str) -> Result<&Column> {
self.index
.get(name)
.map(|&idx| &self.columns[idx])
.ok_or_else(|| IoError::NotFound(format!("Column '{}' not found", name)))
}
pub fn column_by_index(&self, idx: usize) -> Result<&Column> {
self.columns
.get(idx)
.ok_or_else(|| IoError::NotFound(format!("Column index {} out of range", idx)))
}
pub fn columns(&self) -> &[Column] {
&self.columns
}
pub fn get_f64(&self, name: &str) -> Result<&[f64]> {
self.column(name)?.data.as_f64()
}
pub fn get_i64(&self, name: &str) -> Result<&[i64]> {
self.column(name)?.data.as_i64()
}
pub fn get_str(&self, name: &str) -> Result<&[String]> {
self.column(name)?.data.as_str()
}
pub fn get_bool(&self, name: &str) -> Result<&[bool]> {
self.column(name)?.data.as_bool()
}
}
impl Default for ColumnarTable {
fn default() -> Self {
Self::new()
}
}
/// Returns `true` when every value is `<=` its successor.
///
/// Slices with fewer than two elements count as sorted. Any comparison that
/// involves `NaN` is false, so a slice of two or more elements containing
/// `NaN` is reported as not sorted.
fn is_sorted_f64(data: &[f64]) -> bool {
    // Pair each element with its successor; the pairing is empty for short slices.
    data.iter()
        .zip(data.iter().skip(1))
        .all(|(current, next)| current <= next)
}
/// Returns `true` when the values are in non-decreasing order.
///
/// Slices with fewer than two elements count as sorted.
fn is_sorted_i64(data: &[i64]) -> bool {
    // Sorted means no adjacent pair is out of order; `windows(2)` yields
    // nothing for slices shorter than two elements.
    !data.windows(2).any(|pair| pair[0] > pair[1])
}
/// Heuristic: does the slice contain enough repeated consecutive values to
/// make run-based encoding worthwhile?
///
/// Returns `true` when at least 20% of the elements belong to a run of two or
/// more equal consecutive values. Slices shorter than four elements always
/// return `false`. `NaN` never compares equal, so `NaN` values never form runs.
///
/// The elements covered by runs are counted, not the number of runs:
/// counting runs would reject long runs (a hundred identical values form a
/// single run), which are the inputs that benefit most.
fn has_runs_f64(data: &[f64]) -> bool {
    if data.len() < 4 {
        return false;
    }
    let mut covered = 0usize;
    let mut in_run = false;
    for pair in data.windows(2) {
        if pair[0] == pair[1] {
            // The first equal pair of a run covers two elements; each
            // further equal pair extends the run by one.
            covered += if in_run { 1 } else { 2 };
            in_run = true;
        } else {
            in_run = false;
        }
    }
    covered.saturating_mul(5) >= data.len()
}
/// Heuristic: does the slice contain enough repeated consecutive values to
/// make run-based encoding worthwhile?
///
/// Returns `true` when at least 20% of the elements belong to a run of two or
/// more equal consecutive values. Slices shorter than four elements always
/// return `false`.
///
/// The elements covered by runs are counted, not the number of runs:
/// counting runs would reject long runs (a hundred identical values form a
/// single run), which are the inputs that benefit most.
fn has_runs_i64(data: &[i64]) -> bool {
    if data.len() < 4 {
        return false;
    }
    let mut covered = 0usize;
    let mut in_run = false;
    for pair in data.windows(2) {
        if pair[0] == pair[1] {
            // The first equal pair of a run covers two elements; each
            // further equal pair extends the run by one.
            covered += if in_run { 1 } else { 2 };
            in_run = true;
        } else {
            in_run = false;
        }
    }
    covered.saturating_mul(5) >= data.len()
}
/// Heuristic: does the slice contain enough repeated consecutive strings to
/// make run-based encoding worthwhile?
///
/// Returns `true` when at least 20% of the elements belong to a run of two or
/// more equal consecutive strings. Slices shorter than four elements always
/// return `false`.
///
/// The elements covered by runs are counted, not the number of runs:
/// counting runs would reject long runs (a hundred identical values form a
/// single run), which are the inputs that benefit most.
fn has_runs_str(data: &[String]) -> bool {
    if data.len() < 4 {
        return false;
    }
    let mut covered = 0usize;
    let mut in_run = false;
    for pair in data.windows(2) {
        if pair[0] == pair[1] {
            // The first equal pair of a run covers two elements; each
            // further equal pair extends the run by one.
            covered += if in_run { 1 } else { 2 };
            in_run = true;
        } else {
            in_run = false;
        }
    }
    covered.saturating_mul(5) >= data.len()
}
/// Heuristic: does the slice contain enough repeated consecutive values to
/// make run-based encoding worthwhile?
///
/// Returns `true` when at least 20% of the elements belong to a run of two or
/// more equal consecutive values. Slices shorter than four elements always
/// return `false`.
///
/// The elements covered by runs are counted, not the number of runs:
/// counting runs would reject long runs (a hundred identical values form a
/// single run), which are the inputs that benefit most.
fn has_runs_bool(data: &[bool]) -> bool {
    if data.len() < 4 {
        return false;
    }
    let mut covered = 0usize;
    let mut in_run = false;
    for pair in data.windows(2) {
        if pair[0] == pair[1] {
            // The first equal pair of a run covers two elements; each
            // further equal pair extends the run by one.
            covered += if in_run { 1 } else { 2 };
            in_run = true;
        } else {
            in_run = false;
        }
    }
    // Saturating: `covered` can approach the slice length, and a bool slice
    // may in principle be long enough for `* 5` to overflow.
    covered.saturating_mul(5) >= data.len()
}
/// Counts the distinct string values in `data`.
///
/// The strings are compared by content and borrowed, not cloned.
fn count_unique_strings(data: &[String]) -> usize {
    data.iter()
        .map(String::as_str)
        .collect::<std::collections::HashSet<&str>>()
        .len()
}