// gguppy_data 0.4.0
//
// Traits and adapters used by gguppy for dataframe access.
// Documentation
use alloc::{
    boxed::Box,
    format,
    string::{String, ToString},
    vec::Vec,
};
use core::ops::{Add, Sub};

use gguppy_core::data::{ChunkedSlices, DataFrameAdapter, SeriesAdapter, SeriesAdapterError};
use polars::prelude::*;

/// An adapter for Polars [`Series`] that implements the [`SeriesAdapter`] trait.
///
/// The wrapped [`Series`] is exposed as the public tuple field `.0`, so callers can
/// always reach the underlying Polars API directly.
///
/// ## Design Philosophy
/// - Owns [`Series`], which is an `Arc<dyn SeriesTrait + 'static>`. Clones are shallow copies only.
pub struct PolarsSeriesAdapter(pub Series);

impl SeriesAdapter<'_> for PolarsSeriesAdapter {
    type LibError = PolarsError;

    /// Returns the name of the wrapped series.
    fn name(&self) -> &str {
        self.0.name()
    }

    /// Returns the number of elements reported by the wrapped series.
    fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns `true` when the wrapped series reports itself as empty.
    fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Exposes the series' data as one slice per underlying chunk.
    ///
    /// Supported dtypes are `Int32`, `Int64`, `Float32`, `Float64` and `Boolean`.
    /// For the numeric dtypes each slice borrows the chunk's value buffer.
    ///
    /// NOTE(review): only `values()` is read for the numeric dtypes; no validity
    /// information is consulted here, so how null slots surface to the caller
    /// should be confirmed against Polars' buffer semantics.
    ///
    /// # Errors
    /// - [`SeriesAdapterError::DowncastError`] when the typed accessor matching the
    ///   reported dtype fails; `reason` carries the Polars error text.
    /// - [`SeriesAdapterError::UnsupportedDataType`] (with the `Debug` rendering of
    ///   the dtype) for every dtype not listed above.
    ///
    /// # Memory
    /// The `Boolean` branch copies every value into a fresh allocation and then
    /// leaks it with [`Box::leak`] so it can be handed out as a borrowed slice.
    /// Each call on a Boolean series therefore allocates memory that is never freed.
    fn as_chunked_slices(&self) -> Result<ChunkedSlices<'_>, SeriesAdapterError> {
        let dtype = self.0.dtype();

        // Shared constructor for the error returned when a typed accessor rejects the series.
        let downcast_error = |expected: &str, cause: PolarsError| {
            SeriesAdapterError::DowncastError {
                expected_type: expected.to_string(),
                actual_type: format!("{:?}", dtype),
                reason: cause.to_string(),
            }
        };

        match dtype {
            DataType::Int32 => {
                let typed = self.0.i32().map_err(|e| downcast_error("Int32", e))?;
                let slices: Vec<&[i32]> = typed
                    .downcast_iter()
                    .map(|arr| arr.values().as_slice())
                    .collect();
                Ok(ChunkedSlices::I32(slices))
            }
            DataType::Int64 => {
                let typed = self.0.i64().map_err(|e| downcast_error("Int64", e))?;
                let slices: Vec<&[i64]> = typed
                    .downcast_iter()
                    .map(|arr| arr.values().as_slice())
                    .collect();
                Ok(ChunkedSlices::I64(slices))
            }
            DataType::Float32 => {
                let typed = self.0.f32().map_err(|e| downcast_error("Float32", e))?;
                let slices: Vec<&[f32]> = typed
                    .downcast_iter()
                    .map(|arr| arr.values().as_slice())
                    .collect();
                Ok(ChunkedSlices::F32(slices))
            }
            DataType::Float64 => {
                let typed = self.0.f64().map_err(|e| downcast_error("Float64", e))?;
                let slices: Vec<&[f64]> = typed
                    .downcast_iter()
                    .map(|arr| arr.values().as_slice())
                    .collect();
                Ok(ChunkedSlices::F64(slices))
            }
            DataType::Boolean => {
                let typed = self.0.bool().map_err(|e| downcast_error("Boolean", e))?;
                // Polars bit-packs booleans, so there is no `&[bool]` to borrow:
                // every chunk is unpacked value by value into its own allocation.
                let slices: Vec<&[bool]> = typed
                    .downcast_iter()
                    .map(|arr| {
                        let unpacked: Vec<bool> =
                            (0..arr.len()).map(|idx| arr.value(idx)).collect();
                        // Leaked on purpose so the slice can be returned by reference.
                        // This memory is never reclaimed; a better ownership strategy
                        // is still needed.
                        Box::leak(unpacked.into_boxed_slice()) as &[bool]
                    })
                    .collect();
                Ok(ChunkedSlices::Bool(slices))
            }
            // Add more types as needed
            _ => Err(SeriesAdapterError::UnsupportedDataType(format!(
                "{:?}",
                dtype
            ))),
        }
    }
}

impl Add for PolarsSeriesAdapter {
    type Output = Result<Self, PolarsError>;

    fn add(self, rhs: Self) -> Self::Output {
        (&self.0 + &rhs.0).map(PolarsSeriesAdapter)
    }
}

impl Sub for PolarsSeriesAdapter {
    type Output = Result<Self, PolarsError>;

    fn sub(self, rhs: Self) -> Self::Output {
        (&self.0 - &rhs.0).map(PolarsSeriesAdapter)
    }
}

/// An adapter for Polars [`DataFrame`] that implements the [`DataFrameAdapter`] trait.
///
/// The wrapped [`DataFrame`] is exposed as the public tuple field `.0`. Columns
/// retrieved through the adapter are returned as [`PolarsSeriesAdapter`] values.
pub struct PolarsDataFrameAdapter(pub DataFrame);

impl DataFrameAdapter<'_> for PolarsDataFrameAdapter {
    type DataFrame = PolarsDataFrameAdapter;
    type Series = PolarsSeriesAdapter;
    type LibError = PolarsError;

    fn column_names(&self) -> Vec<String> {
        self.0
            .get_column_names_owned()
            .into_iter()
            .map(|s| s.to_string())
            .collect()
    }

    fn col(&self, name: &str) -> Result<Self::Series, Self::LibError> {
        let column = self.0.column(name)?;
        let series = column.as_materialized_series();
        Ok(PolarsSeriesAdapter(series.clone()))
    }

    fn select(&self, names: Vec<&str>) -> Result<Self::DataFrame, Self::LibError> {
        let df = self.0.select(names)?;
        Ok(PolarsDataFrameAdapter(df))
    }

    fn shape(&self) -> (usize, usize) {
        self.0.shape()
    }
}

// TODO: Implement `From<DataFrame>` for `PolarsDataFrameAdapter` so a `DataFrame` can be converted with `.into()`.

#[cfg(test)]
mod test {
    //! Unit tests for the Polars series and dataframe adapters.

    use super::*;

    /// Builds an `Int32` series adapter called `name` holding `values`.
    fn int_adapter(name: &str, values: &[i32]) -> PolarsSeriesAdapter {
        PolarsSeriesAdapter(Series::new(name.into(), values))
    }

    /// The adapter reports the name of the series it wraps.
    #[test]
    fn series_adapter_name() {
        let adapter = int_adapter("test", &[1, 2, 3]);
        assert_eq!(adapter.name(), "test");
    }

    /// A three-element series has length 3 and is not empty.
    #[test]
    fn series_adapter_len() {
        let adapter = int_adapter("test", &[1, 2, 3]);
        assert_eq!(adapter.len(), 3);
        assert!(!adapter.is_empty());
    }

    /// A zero-element series has length 0 and is empty.
    #[test]
    fn series_adapter_empty() {
        let adapter = PolarsSeriesAdapter(Series::new("empty".into(), &[0f32; 0]));
        assert_eq!(adapter.len(), 0);
        assert!(adapter.is_empty());
    }

    /// An `Int32` series is exposed as a single `I32` chunk holding its values.
    #[test]
    fn series_adapter_as_chunked_slice() {
        let adapter = int_adapter("Int32", &[1, 2, 3]);
        match adapter.as_chunked_slices() {
            Ok(ChunkedSlices::I32(chunks)) => {
                assert_eq!(chunks.len(), 1);
                assert_eq!(chunks[0], &[1, 2, 3]);
            }
            _ => panic!("expected `Ok(ChunkedSlices::I32(_))`"),
        }
    }

    /// Adding two adapters sums element-wise and keeps the left-hand name.
    #[test]
    fn series_adapter_add() {
        let lhs = int_adapter("a", &[1, 2, 3]);
        let rhs = int_adapter("b", &[4, 5, 6]);

        let sum = (lhs + rhs).expect("adding two Int32 series should succeed");
        assert_eq!(sum.name(), "a"); // NOTE this isn't `a + b`
        assert_eq!(sum.len(), 3);
        assert!(matches!(sum.0.dtype(), DataType::Int32));

        let values: Vec<i32> = sum.0.i32().unwrap().into_no_null_iter().collect();
        assert_eq!(values, vec![5, 7, 9]);
    }

    /// Subtracting two adapters subtracts element-wise and keeps the left-hand name.
    #[test]
    fn series_adapter_sub() {
        let lhs = int_adapter("a", &[4, 5, 6]);
        let rhs = int_adapter("b", &[1, 2, 3]);

        let difference = (lhs - rhs).expect("subtracting two Int32 series should succeed");
        assert_eq!(difference.name(), "a"); // NOTE this isn't `a - b`
        assert_eq!(difference.len(), 3);

        let values: Vec<i32> = difference.0.i32().unwrap().into_no_null_iter().collect();
        assert_eq!(values, vec![3, 3, 3]);
    }

    /// Column names come back complete and in declaration order.
    #[test]
    fn dataframe_adapter_column_names() {
        let frame = df!(
            "alpha" => [1, 2, 3],
            "beta" => [4, 5, 6],
            "gamma" => [7, 8, 9]
        )
        .unwrap();
        let names = PolarsDataFrameAdapter(frame).column_names();
        let expected = vec!["alpha".to_string(), "beta".to_string(), "gamma".to_string()];

        assert_eq!(names.len(), 3);

        // Test contents
        for name in &expected {
            assert!(names.contains(name));
        }

        // Test ordering
        assert_eq!(names, expected);
    }

    /// `col` returns existing columns and reports `ColumnNotFound` otherwise.
    #[test]
    fn dataframe_adapter_col() {
        let frame = df![
            "numbers" => [10, 20, 30],
            "letters" => ["x", "y", "z"]
        ]
        .unwrap();
        let adapter = PolarsDataFrameAdapter(frame);

        // Test successful column retrieval
        for column_name in ["numbers", "letters"] {
            let column = adapter.col(column_name).unwrap();
            assert_eq!(column.name(), column_name);
            assert_eq!(column.len(), 3);
        }

        // Test error case
        match adapter.col("nonexistent") {
            Err(PolarsError::ColumnNotFound(message)) => {
                assert_eq!(message.to_string(), "\"nonexistent\" not found");
            }
            _ => panic!("expected `PolarsError::ColumnNotFound`"),
        }
    }

    /// `select` keeps the requested columns and reports `ColumnNotFound` for unknown ones.
    #[test]
    fn dataframe_adapter_select() {
        let frame = df![
            "a" => [1, 2, 3],
            "b" => [4, 5, 6],
            "c" => [7, 8, 9],
            "d" => ["x", "y", "z"]
        ]
        .unwrap();
        let adapter = PolarsDataFrameAdapter(frame);

        // Test successful selection
        let pair = adapter.select(vec!["a", "c"]).unwrap();
        assert_eq!(pair.shape(), (3, 2));
        assert_eq!(pair.column_names(), vec!["a".to_string(), "c".to_string()]);

        // Test single column selection
        let single = adapter.select(vec!["d"]).unwrap();
        assert_eq!(single.shape(), (3, 1));
        assert_eq!(single.column_names(), vec!["d".to_string()]);

        // Test empty selection
        let none = adapter.select(vec![]).unwrap();
        assert_eq!(none.shape(), (3, 0)); // TODO: Should this be (0, 0)?

        // Test error case
        match adapter.select(vec!["a", "nonexistent"]) {
            Err(PolarsError::ColumnNotFound(message)) => {
                assert_eq!(message.to_string(), "\"nonexistent\" not found");
            }
            _ => panic!("expected `PolarsError::ColumnNotFound`"),
        }
    }
}