use crate::error::InsightError;
/// Packed bitmap recording, for each of `len` logical slots, whether the slot
/// holds a value (bit set) or is null (bit clear).
///
/// Slot `i` is stored in bit `i % 64` of word `i / 64`. Bits of the last word
/// at positions `>= len` are padding. The constructors leave padding cleared,
/// and [`null_count`](Self::null_count) / [`valid_count`](Self::valid_count)
/// count every stored bit, so they rely on padding staying cleared.
#[derive(Debug, Clone, PartialEq)]
pub struct ValidityBitmap {
    bits: Vec<u64>,
    len: usize,
}

impl ValidityBitmap {
    /// Number of slots packed into one backing word.
    const WORD_BITS: usize = 64;

    /// Splits a slot index into its backing word index and a single-bit mask.
    #[inline]
    fn locate(idx: usize) -> (usize, u64) {
        (idx / Self::WORD_BITS, 1u64 << (idx % Self::WORD_BITS))
    }

    /// Creates a bitmap of `len` slots, all marked valid.
    pub fn all_valid(len: usize) -> Self {
        let mut bits = vec![u64::MAX; len.div_ceil(Self::WORD_BITS)];
        let trailing = len % Self::WORD_BITS;
        if trailing != 0 {
            // A non-zero remainder implies `len > 0`, so a last word exists.
            // Keep only the low `trailing` bits so the padding stays cleared.
            if let Some(last) = bits.last_mut() {
                *last = (1u64 << trailing) - 1;
            }
        }
        Self { bits, len }
    }

    /// Creates a bitmap of `len` slots, all marked null.
    pub fn all_invalid(len: usize) -> Self {
        Self {
            bits: vec![0u64; len.div_ceil(Self::WORD_BITS)],
            len,
        }
    }

    /// Creates a bitmap with zero slots and no backing storage.
    pub fn empty() -> Self {
        Self {
            bits: Vec::new(),
            len: 0,
        }
    }

    /// Returns `true` when slot `idx` is marked valid.
    ///
    /// # Panics
    ///
    /// With debug assertions enabled, panics if `idx >= len`. Without them the
    /// bound is not checked: an index inside the last word's padding reads the
    /// stored padding bit, and an index past the backing words panics on the
    /// slice access.
    #[inline]
    pub fn is_valid(&self, idx: usize) -> bool {
        debug_assert!(
            idx < self.len,
            "index {idx} out of bounds (len={})",
            self.len
        );
        let (word, mask) = Self::locate(idx);
        self.bits[word] & mask != 0
    }

    /// Marks slot `idx` as valid.
    ///
    /// # Panics
    ///
    /// With debug assertions enabled, panics if `idx >= len`. Without them the
    /// bound is not checked, so an index inside the last word's padding sets a
    /// padding bit; callers must keep `idx < len`.
    #[inline]
    pub fn set_valid(&mut self, idx: usize) {
        debug_assert!(
            idx < self.len,
            "index {idx} out of bounds (len={})",
            self.len
        );
        let (word, mask) = Self::locate(idx);
        self.bits[word] |= mask;
    }

    /// Marks slot `idx` as null.
    ///
    /// # Panics
    ///
    /// With debug assertions enabled, panics if `idx >= len`. Without them,
    /// panics only when `idx` lies past the backing words.
    #[inline]
    pub fn set_invalid(&mut self, idx: usize) {
        debug_assert!(
            idx < self.len,
            "index {idx} out of bounds (len={})",
            self.len
        );
        let (word, mask) = Self::locate(idx);
        self.bits[word] &= !mask;
    }

    /// Appends one slot with the given validity, growing the backing storage
    /// by one word when the new slot starts a fresh word.
    pub fn push(&mut self, valid: bool) {
        let (word, mask) = Self::locate(self.len);
        if word >= self.bits.len() {
            self.bits.push(0);
        }
        // Write the bit in both directions rather than assuming the padding
        // bit is already clear: the bounds checks on the setters are
        // debug-only, so a stale padding bit must not leak into the new slot.
        if valid {
            self.bits[word] |= mask;
        } else {
            self.bits[word] &= !mask;
        }
        self.len += 1;
    }

    /// Number of slots in the bitmap.
    #[inline]
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns `true` when the bitmap has no slots.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Number of slots marked null (`len` minus the number of set bits).
    pub fn null_count(&self) -> usize {
        self.len - self.valid_count()
    }

    /// Number of slots marked valid, i.e. the population count of all words.
    pub fn valid_count(&self) -> usize {
        self.bits.iter().map(|w| w.count_ones() as usize).sum()
    }

    /// Returns `true` when at least one slot is null.
    pub fn has_nulls(&self) -> bool {
        self.null_count() > 0
    }

    /// Iterates, in ascending order, over the indices of all valid slots.
    pub fn valid_indices(&self) -> ValidIndicesIter<'_> {
        ValidIndicesIter {
            bitmap: self,
            current: 0,
        }
    }
}

/// Iterator over the indices of valid slots in a [`ValidityBitmap`].
///
/// Created by [`ValidityBitmap::valid_indices`].
#[derive(Debug, Clone)]
pub struct ValidIndicesIter<'a> {
    bitmap: &'a ValidityBitmap,
    /// Next slot index to examine.
    current: usize,
}

impl<'a> Iterator for ValidIndicesIter<'a> {
    type Item = usize;

    fn next(&mut self) -> Option<usize> {
        while self.current < self.bitmap.len {
            let idx = self.current;
            self.current += 1;
            if self.bitmap.is_valid(idx) {
                return Some(idx);
            }
        }
        None
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // Every remaining slot may be null, so the lower bound is zero; at
        // most every unexamined slot is valid.
        (0, Some(self.bitmap.len - self.current))
    }
}
/// Kind of data stored in a column.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataType {
    Numeric,
    Boolean,
    Categorical,
    Text,
}

impl std::fmt::Display for DataType {
    /// Writes the variant name verbatim (e.g. `Numeric`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            DataType::Numeric => "Numeric",
            DataType::Boolean => "Boolean",
            DataType::Categorical => "Categorical",
            DataType::Text => "Text",
        };
        f.write_str(label)
    }
}
/// A single typed column: per-row storage plus a [`ValidityBitmap`] marking
/// which rows hold a value.
///
/// The constructors do not check that the storage and the bitmap have the
/// same length; [`Column::len`] reports the bitmap's length.
#[derive(Debug, Clone, PartialEq)]
pub enum Column {
    /// Floating-point values, one per row.
    Numeric {
        values: Vec<f64>,
        validity: ValidityBitmap,
    },
    /// Boolean values, one per row.
    Boolean {
        values: Vec<bool>,
        validity: ValidityBitmap,
    },
    /// Dictionary-encoded strings: `indices[row]` is a position in
    /// `dictionary`.
    Categorical {
        dictionary: Vec<String>,
        indices: Vec<u32>,
        validity: ValidityBitmap,
    },
    /// Owned strings, one per row.
    Text {
        values: Vec<String>,
        validity: ValidityBitmap,
    },
}

impl Column {
    /// Builds a numeric column from its values and validity bitmap.
    pub fn numeric(values: Vec<f64>, validity: ValidityBitmap) -> Self {
        Self::Numeric { values, validity }
    }

    /// Builds a boolean column from its values and validity bitmap.
    pub fn boolean(values: Vec<bool>, validity: ValidityBitmap) -> Self {
        Self::Boolean { values, validity }
    }

    /// Builds a categorical column from a dictionary, per-row dictionary
    /// positions, and a validity bitmap.
    pub fn categorical(
        dictionary: Vec<String>,
        indices: Vec<u32>,
        validity: ValidityBitmap,
    ) -> Self {
        Self::Categorical {
            dictionary,
            indices,
            validity,
        }
    }

    /// Builds a text column from its values and validity bitmap.
    pub fn text(values: Vec<String>, validity: ValidityBitmap) -> Self {
        Self::Text { values, validity }
    }

    /// Returns the [`DataType`] tag matching this column's variant.
    pub fn data_type(&self) -> DataType {
        match self {
            Self::Numeric { .. } => DataType::Numeric,
            Self::Boolean { .. } => DataType::Boolean,
            Self::Categorical { .. } => DataType::Categorical,
            Self::Text { .. } => DataType::Text,
        }
    }

    /// Number of rows, taken from the validity bitmap.
    pub fn len(&self) -> usize {
        self.validity().len()
    }

    /// Returns `true` when the column has no rows.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Borrows the validity bitmap of whichever variant this is.
    pub fn validity(&self) -> &ValidityBitmap {
        match self {
            Self::Numeric { validity, .. }
            | Self::Boolean { validity, .. }
            | Self::Categorical { validity, .. }
            | Self::Text { validity, .. } => validity,
        }
    }

    /// Number of null rows.
    pub fn null_count(&self) -> usize {
        self.validity().null_count()
    }

    /// Number of non-null rows.
    pub fn valid_count(&self) -> usize {
        self.validity().valid_count()
    }

    /// Returns `true` when row `idx` holds a value.
    ///
    /// Delegates to [`ValidityBitmap::is_valid`], including its behaviour for
    /// an out-of-range `idx`.
    pub fn is_valid(&self, idx: usize) -> bool {
        self.validity().is_valid(idx)
    }

    /// Borrows the raw value buffer (null rows included) of a numeric column;
    /// `None` for any other variant.
    pub fn as_numeric(&self) -> Option<&[f64]> {
        let Self::Numeric { values, .. } = self else {
            return None;
        };
        Some(values.as_slice())
    }

    /// Borrows the raw value buffer (null rows included) of a boolean column;
    /// `None` for any other variant.
    pub fn as_boolean(&self) -> Option<&[bool]> {
        let Self::Boolean { values, .. } = self else {
            return None;
        };
        Some(values.as_slice())
    }

    /// Copies out the values of all non-null rows of a numeric column, in row
    /// order; `None` for any other variant.
    ///
    /// # Panics
    ///
    /// Panics if the bitmap marks a row valid that lies past the end of the
    /// value buffer.
    pub fn valid_numeric_values(&self) -> Option<Vec<f64>> {
        let Self::Numeric { values, validity } = self else {
            return None;
        };
        Some(validity.valid_indices().map(|row| values[row]).collect())
    }

    /// Returns the category string at row `idx` of a categorical column.
    ///
    /// Yields `None` when the column is not categorical, when `idx` is out of
    /// range, when the row is null, or when the stored dictionary position
    /// does not exist in the dictionary.
    pub fn category_at(&self, idx: usize) -> Option<&str> {
        let Self::Categorical {
            dictionary,
            indices,
            validity,
        } = self
        else {
            return None;
        };
        // Guard the range first so an out-of-range row is reported as `None`
        // instead of tripping the bitmap's debug assertion or a slice panic.
        if idx >= validity.len() || !validity.is_valid(idx) {
            return None;
        }
        let code = usize::try_from(*indices.get(idx)?).ok()?;
        dictionary.get(code).map(String::as_str)
    }

    /// Returns the string at row `idx` of a text column.
    ///
    /// Yields `None` when the column is not a text column, when `idx` is out
    /// of range, or when the row is null.
    pub fn text_at(&self, idx: usize) -> Option<&str> {
        let Self::Text { values, validity } = self else {
            return None;
        };
        // Same range guard as `category_at`: out-of-range rows yield `None`.
        if idx >= validity.len() || !validity.is_valid(idx) {
            return None;
        }
        values.get(idx).map(String::as_str)
    }
}
#[derive(Debug, Clone)]
pub struct DataFrame {
names: Vec<String>,
columns: Vec<Column>,
row_count: usize,
}
impl DataFrame {
pub fn new() -> Self {
Self {
names: Vec::new(),
columns: Vec::new(),
row_count: 0,
}
}
pub fn add_column(&mut self, name: String, column: Column) -> Result<(), InsightError> {
let col_len = column.len();
if self.columns.is_empty() {
self.row_count = col_len;
} else if col_len != self.row_count {
return Err(InsightError::DimensionMismatch {
expected: self.row_count,
actual: col_len,
});
}
self.names.push(name);
self.columns.push(column);
Ok(())
}
#[inline]
pub fn row_count(&self) -> usize {
self.row_count
}
#[inline]
pub fn column_count(&self) -> usize {
self.columns.len()
}
pub fn is_empty(&self) -> bool {
self.columns.is_empty()
}
pub fn column_names(&self) -> &[String] {
&self.names
}
pub fn column(&self, index: usize) -> Option<&Column> {
self.columns.get(index)
}
pub fn column_by_name(&self, name: &str) -> Option<&Column> {
self.column_index(name).map(|i| &self.columns[i])
}
pub fn column_index(&self, name: &str) -> Option<usize> {
self.names.iter().position(|n| n == name)
}
pub fn iter(&self) -> impl Iterator<Item = (&str, &Column)> {
self.names
.iter()
.map(|s| s.as_str())
.zip(self.columns.iter())
}
pub fn schema(&self) -> Vec<(&str, DataType)> {
self.names
.iter()
.zip(self.columns.iter())
.map(|(name, col)| (name.as_str(), col.data_type()))
.collect()
}
pub fn total_null_count(&self) -> usize {
self.columns.iter().map(|c| c.null_count()).sum()
}
}
impl Default for DataFrame {
fn default() -> Self {
Self::new()
}
}
// Unit tests for ValidityBitmap, Column and DataFrame.
#[cfg(test)]
mod tests {
use super::*;
// all_valid(100): length is 100, no nulls, and every slot reads as valid.
#[test]
fn bitmap_all_valid() {
let bm = ValidityBitmap::all_valid(100);
assert_eq!(bm.len(), 100);
assert_eq!(bm.null_count(), 0);
assert_eq!(bm.valid_count(), 100);
for i in 0..100 {
assert!(bm.is_valid(i));
}
}
// all_invalid(100): every slot is null.
#[test]
fn bitmap_all_invalid() {
let bm = ValidityBitmap::all_invalid(100);
assert_eq!(bm.null_count(), 100);
assert_eq!(bm.valid_count(), 0);
for i in 0..100 {
assert!(!bm.is_valid(i));
}
}
// set_invalid / set_valid flip individual slots and the null count follows.
#[test]
fn bitmap_set_operations() {
let mut bm = ValidityBitmap::all_valid(10);
bm.set_invalid(3);
bm.set_invalid(7);
assert_eq!(bm.null_count(), 2);
assert!(!bm.is_valid(3));
assert!(!bm.is_valid(7));
assert!(bm.is_valid(0));
assert!(bm.is_valid(9));
bm.set_valid(3);
assert!(bm.is_valid(3));
assert_eq!(bm.null_count(), 1);
}
// push appends slots one at a time, starting from an empty bitmap.
#[test]
fn bitmap_push() {
let mut bm = ValidityBitmap::empty();
bm.push(true);
bm.push(false);
bm.push(true);
assert_eq!(bm.len(), 3);
assert!(bm.is_valid(0));
assert!(!bm.is_valid(1));
assert!(bm.is_valid(2));
assert_eq!(bm.null_count(), 1);
}
// Exactly 64 slots fit in one word; 65 slots need a second word whose
// padding must not be counted as valid.
#[test]
fn bitmap_boundary_64() {
let bm = ValidityBitmap::all_valid(64);
assert_eq!(bm.bits.len(), 1);
assert_eq!(bm.null_count(), 0);
let bm65 = ValidityBitmap::all_valid(65);
assert_eq!(bm65.bits.len(), 2);
assert_eq!(bm65.null_count(), 0);
assert!(bm65.is_valid(64));
}
// Pushing 128 slots spans two words; every third slot (i % 3 == 0) is null.
#[test]
fn bitmap_push_across_word_boundary() {
let mut bm = ValidityBitmap::empty();
for i in 0..128 {
bm.push(i % 3 != 0); }
assert_eq!(bm.len(), 128);
let expected_nulls = (0..128).filter(|i| i % 3 == 0).count();
assert_eq!(bm.null_count(), expected_nulls);
}
// valid_indices yields only the non-null slot indices, in ascending order.
#[test]
fn bitmap_valid_indices() {
let mut bm = ValidityBitmap::all_valid(5);
bm.set_invalid(1);
bm.set_invalid(3);
let indices: Vec<usize> = bm.valid_indices().collect();
assert_eq!(indices, vec![0, 2, 4]);
}
// A fully valid numeric column reports its type, length and raw values.
#[test]
fn numeric_column_basics() {
let col = Column::numeric(vec![1.0, 2.0, 3.0], ValidityBitmap::all_valid(3));
assert_eq!(col.data_type(), DataType::Numeric);
assert_eq!(col.len(), 3);
assert_eq!(col.null_count(), 0);
assert_eq!(col.as_numeric(), Some(&[1.0, 2.0, 3.0][..]));
}
// Null rows are excluded from valid_numeric_values; the 0.0 entries are
// placeholders at the null positions.
#[test]
fn numeric_column_with_nulls() {
let mut validity = ValidityBitmap::all_valid(4);
validity.set_invalid(1);
validity.set_invalid(3);
let col = Column::numeric(vec![1.0, 0.0, 3.0, 0.0], validity);
assert_eq!(col.null_count(), 2);
assert_eq!(col.valid_count(), 2);
assert!(col.is_valid(0));
assert!(!col.is_valid(1));
let valid = col.valid_numeric_values().expect("numeric column");
assert_eq!(valid, vec![1.0, 3.0]);
}
// A boolean column reports its type and exposes its raw values.
#[test]
fn boolean_column() {
let col = Column::boolean(vec![true, false, true], ValidityBitmap::all_valid(3));
assert_eq!(col.data_type(), DataType::Boolean);
assert_eq!(col.as_boolean(), Some(&[true, false, true][..]));
}
// category_at resolves each row's dictionary position to its string.
#[test]
fn categorical_column() {
let dict = vec!["low".into(), "med".into(), "high".into()];
let indices = vec![0, 1, 2, 1, 0];
let col = Column::categorical(dict, indices, ValidityBitmap::all_valid(5));
assert_eq!(col.data_type(), DataType::Categorical);
assert_eq!(col.category_at(0), Some("low"));
assert_eq!(col.category_at(1), Some("med"));
assert_eq!(col.category_at(2), Some("high"));
assert_eq!(col.category_at(3), Some("med"));
}
// category_at returns None for a null row even though an index is stored.
#[test]
fn categorical_column_with_null() {
let dict = vec!["a".into(), "b".into()];
let indices = vec![0, 0, 1];
let mut validity = ValidityBitmap::all_valid(3);
validity.set_invalid(1);
let col = Column::categorical(dict, indices, validity);
assert_eq!(col.category_at(0), Some("a"));
assert_eq!(col.category_at(1), None);
assert_eq!(col.category_at(2), Some("b"));
}
// text_at returns the stored string for valid rows.
#[test]
fn text_column() {
let col = Column::text(
vec!["hello".into(), "world".into()],
ValidityBitmap::all_valid(2),
);
assert_eq!(col.data_type(), DataType::Text);
assert_eq!(col.text_at(0), Some("hello"));
assert_eq!(col.text_at(1), Some("world"));
}
// text_at returns None for a null row; the empty string is a placeholder.
#[test]
fn text_column_with_null() {
let mut validity = ValidityBitmap::all_valid(2);
validity.set_invalid(0);
let col = Column::text(vec![String::new(), "world".into()], validity);
assert_eq!(col.text_at(0), None);
assert_eq!(col.text_at(1), Some("world"));
}
// A new frame has no rows, no columns, and reports itself as empty.
#[test]
fn empty_dataframe() {
let df = DataFrame::new();
assert_eq!(df.row_count(), 0);
assert_eq!(df.column_count(), 0);
assert!(df.is_empty());
}
// Adding two equal-length columns succeeds and preserves insertion order.
#[test]
fn add_columns() {
let mut df = DataFrame::new();
df.add_column(
"x".to_string(),
Column::numeric(vec![1.0, 2.0, 3.0], ValidityBitmap::all_valid(3)),
)
.expect("first column");
df.add_column(
"y".to_string(),
Column::numeric(vec![4.0, 5.0, 6.0], ValidityBitmap::all_valid(3)),
)
.expect("second column");
assert_eq!(df.row_count(), 3);
assert_eq!(df.column_count(), 2);
assert_eq!(df.column_names(), &["x", "y"]);
}
// A column whose length differs from the frame's row count is rejected.
#[test]
fn column_length_mismatch() {
let mut df = DataFrame::new();
df.add_column(
"x".to_string(),
Column::numeric(vec![1.0, 2.0], ValidityBitmap::all_valid(2)),
)
.unwrap();
let result = df.add_column(
"y".to_string(),
Column::numeric(vec![1.0, 2.0, 3.0], ValidityBitmap::all_valid(3)),
);
assert!(result.is_err());
}
// column_by_name finds an existing column and returns None for unknown names.
#[test]
fn column_by_name_lookup() {
let mut df = DataFrame::new();
df.add_column(
"temp".to_string(),
Column::numeric(vec![20.5, 21.3], ValidityBitmap::all_valid(2)),
)
.unwrap();
let col = df.column_by_name("temp").expect("found");
assert_eq!(col.data_type(), DataType::Numeric);
assert!(df.column_by_name("missing").is_none());
}
// schema lists (name, type) pairs in insertion order across mixed types.
#[test]
fn dataframe_schema() {
let mut df = DataFrame::new();
df.add_column(
"x".to_string(),
Column::numeric(vec![1.0], ValidityBitmap::all_valid(1)),
)
.unwrap();
df.add_column(
"ok".to_string(),
Column::boolean(vec![true], ValidityBitmap::all_valid(1)),
)
.unwrap();
df.add_column(
"label".to_string(),
Column::text(vec!["a".into()], ValidityBitmap::all_valid(1)),
)
.unwrap();
let schema = df.schema();
assert_eq!(schema[0], ("x", DataType::Numeric));
assert_eq!(schema[1], ("ok", DataType::Boolean));
assert_eq!(schema[2], ("label", DataType::Text));
}
// total_null_count sums nulls over all columns (1 in "a" + 2 in "b").
#[test]
fn total_null_count() {
let mut df = DataFrame::new();
let mut v1 = ValidityBitmap::all_valid(3);
v1.set_invalid(1);
let mut v2 = ValidityBitmap::all_valid(3);
v2.set_invalid(0);
v2.set_invalid(2);
df.add_column("a".into(), Column::numeric(vec![1.0, 0.0, 3.0], v1))
.unwrap();
df.add_column("b".into(), Column::numeric(vec![0.0, 5.0, 0.0], v2))
.unwrap();
assert_eq!(df.total_null_count(), 3);
}
// iter yields (name, column) pairs in insertion order.
#[test]
fn dataframe_iter() {
let mut df = DataFrame::new();
df.add_column(
"x".into(),
Column::numeric(vec![1.0], ValidityBitmap::all_valid(1)),
)
.unwrap();
df.add_column(
"y".into(),
Column::numeric(vec![2.0], ValidityBitmap::all_valid(1)),
)
.unwrap();
let pairs: Vec<(&str, DataType)> = df.iter().map(|(n, c)| (n, c.data_type())).collect();
assert_eq!(
pairs,
vec![("x", DataType::Numeric), ("y", DataType::Numeric)]
);
}
}