Skip to main content

ggplot_rs/data/
source.rs

1use std::collections::HashMap;
2
3use super::{DataFrame, Value};
4
5/// Trait for types that can be converted into our internal DataFrame.
6pub trait GGData {
7    fn into_dataframe(self) -> DataFrame;
8}
9
/// Convert a polars `AnyValue` to our internal `Value` enum.
///
/// Mapping rules:
/// - Floats become `Value::Float`; integers become `Value::Integer`.
///   A `UInt64` that does not fit in `i64` becomes `Value::Float` (keeping its
///   magnitude, losing some precision) instead of wrapping to a negative number.
/// - `Boolean` becomes `Value::Bool`, strings become `Value::Str`, `Null`
///   becomes `Value::Na`.
/// - `Date` (days since the epoch) and `Datetime` become `Value::DateTime` in
///   whole seconds since the epoch. `Datetime` honours the value's own time
///   unit, and sub-second parts are floored so pre-epoch instants land in the
///   correct second.
/// - `Duration` becomes `Value::Integer` holding the raw tick count in the
///   column's own time unit (it is not normalised).
/// - `Time` (nanoseconds since midnight) becomes `Value::Integer` in whole
///   seconds.
/// - Every other variant is kept as its `Debug` string so no data is dropped.
#[cfg(feature = "polars")]
fn polars_anyvalue_to_value(v: polars::datatypes::AnyValue) -> Value {
    use polars::datatypes::{AnyValue, TimeUnit};
    match v {
        AnyValue::Float64(f) => Value::Float(f),
        AnyValue::Float32(f) => Value::Float(f as f64),
        AnyValue::Int64(i) => Value::Integer(i),
        AnyValue::Int32(i) => Value::Integer(i as i64),
        AnyValue::Int16(i) => Value::Integer(i as i64),
        AnyValue::Int8(i) => Value::Integer(i as i64),
        // `as i64` would wrap values above i64::MAX to negative numbers;
        // fall back to a float so the magnitude survives.
        AnyValue::UInt64(u) => match i64::try_from(u) {
            Ok(i) => Value::Integer(i),
            Err(_) => Value::Float(u as f64),
        },
        AnyValue::UInt32(i) => Value::Integer(i as i64),
        AnyValue::UInt16(i) => Value::Integer(i as i64),
        AnyValue::UInt8(i) => Value::Integer(i as i64),
        AnyValue::Boolean(b) => Value::Bool(b),
        AnyValue::String(s) => Value::Str(s.to_string()),
        AnyValue::StringOwned(s) => Value::Str(s.to_string()),
        AnyValue::Null => Value::Na,
        AnyValue::Date(d) => Value::DateTime(d as i64 * 86400),
        AnyValue::Datetime(ticks, unit, _) => {
            // The tick count is expressed in the column's time unit, which is
            // not always microseconds.
            let ticks_per_second: i64 = match unit {
                TimeUnit::Nanoseconds => 1_000_000_000,
                TimeUnit::Microseconds => 1_000_000,
                TimeUnit::Milliseconds => 1_000,
            };
            // Floor (not truncate) so e.g. -1 tick maps to second -1, not 0.
            Value::DateTime(ticks.div_euclid(ticks_per_second))
        }
        // Raw tick count; the unit is intentionally left as-is.
        AnyValue::Duration(ticks, _) => Value::Integer(ticks),
        AnyValue::Time(ns) => Value::Integer(ns / 1_000_000_000),
        other => Value::Str(format!("{:?}", other)),
    }
}
36
/// polars `DataFrame` input: every polars column is copied, cell by cell,
/// into a column of the internal format under the same name.
#[cfg(feature = "polars")]
impl GGData for polars::frame::DataFrame {
    fn into_dataframe(self) -> DataFrame {
        let mut out = DataFrame::new();
        for series in self.get_columns() {
            let n_rows = series.len();
            let mut cells: Vec<Value> = Vec::with_capacity(n_rows);
            for row in 0..n_rows {
                // `row` is always below `n_rows`, so the lookup is in bounds.
                let raw = series.get(row).unwrap();
                cells.push(polars_anyvalue_to_value(raw));
            }
            out.add_column(series.name().to_string(), cells);
        }
        out
    }
}
52
/// Extract a single Arrow array into a column of our internal `Value`s.
///
/// Nulls become `Value::Na`. Numeric, boolean and string arrays map to the
/// matching `Value` variant; a `UInt64` that does not fit in `i64` becomes
/// `Value::Float` instead of wrapping to a negative number. Date and
/// timestamp arrays become `Value::DateTime` in whole seconds since the
/// epoch, flooring any sub-second part so pre-epoch instants land in the
/// correct second.
///
/// Any other type (nested, decimal, etc.) is rendered with Arrow's display
/// formatter and stored as `Value::Str`, so no data is silently dropped. If
/// the formatter itself fails, the cell becomes an empty string.
///
/// # Panics
///
/// Panics if the array's concrete type does not match what its `data_type()`
/// reports, which would indicate a malformed Arrow array.
#[cfg(feature = "arrow")]
fn arrow_array_to_values(array: &dyn arrow::array::Array) -> Vec<Value> {
    use arrow::array::{
        Array, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array,
        Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray,
        TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
        TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
    };
    use arrow::datatypes::{DataType, TimeUnit};

    /// Convert a `u64` without wrapping: values above `i64::MAX` become floats.
    fn u64_to_value(v: u64) -> Value {
        match i64::try_from(v) {
            Ok(i) => Value::Integer(i),
            Err(_) => Value::Float(v as f64),
        }
    }

    /// Convert epoch ticks to whole epoch seconds, rounding toward -infinity.
    fn epoch_seconds(ticks: i64, ticks_per_second: i64) -> Value {
        Value::DateTime(ticks.div_euclid(ticks_per_second))
    }

    let n = array.len();
    // Helper: downcast, then map each row (honouring nulls) via `f`.
    macro_rules! map_col {
        ($ty:ty, $f:expr) => {{
            let a = array
                .as_any()
                .downcast_ref::<$ty>()
                .expect("Arrow array's concrete type must match its data_type()");
            (0..n)
                .map(|i| {
                    if a.is_null(i) {
                        Value::Na
                    } else {
                        $f(a.value(i))
                    }
                })
                .collect()
        }};
    }

    match array.data_type() {
        DataType::Float64 => map_col!(Float64Array, Value::Float),
        DataType::Float32 => map_col!(Float32Array, |v: f32| Value::Float(v as f64)),
        DataType::Int64 => map_col!(Int64Array, Value::Integer),
        DataType::Int32 => map_col!(Int32Array, |v: i32| Value::Integer(v as i64)),
        DataType::Int16 => map_col!(Int16Array, |v: i16| Value::Integer(v as i64)),
        DataType::Int8 => map_col!(Int8Array, |v: i8| Value::Integer(v as i64)),
        DataType::UInt64 => map_col!(UInt64Array, u64_to_value),
        DataType::UInt32 => map_col!(UInt32Array, |v: u32| Value::Integer(v as i64)),
        DataType::UInt16 => map_col!(UInt16Array, |v: u16| Value::Integer(v as i64)),
        DataType::UInt8 => map_col!(UInt8Array, |v: u8| Value::Integer(v as i64)),
        DataType::Boolean => map_col!(BooleanArray, Value::Bool),
        DataType::Utf8 => map_col!(StringArray, |v: &str| Value::Str(v.to_string())),
        DataType::LargeUtf8 => map_col!(LargeStringArray, |v: &str| Value::Str(v.to_string())),
        // Date32 = days since epoch, Date64 = ms since epoch → seconds.
        DataType::Date32 => map_col!(Date32Array, |v: i32| Value::DateTime(v as i64 * 86_400)),
        DataType::Date64 => map_col!(Date64Array, |v: i64| epoch_seconds(v, 1_000)),
        DataType::Timestamp(unit, _) => match unit {
            TimeUnit::Second => map_col!(TimestampSecondArray, Value::DateTime),
            TimeUnit::Millisecond => {
                map_col!(TimestampMillisecondArray, |v: i64| epoch_seconds(v, 1_000))
            }
            TimeUnit::Microsecond => {
                map_col!(TimestampMicrosecondArray, |v: i64| epoch_seconds(
                    v, 1_000_000
                ))
            }
            TimeUnit::Nanosecond => {
                map_col!(TimestampNanosecondArray, |v: i64| epoch_seconds(
                    v,
                    1_000_000_000
                ))
            }
        },
        // Anything else (nested, decimal, etc.): keep the data as a string.
        _ => (0..n)
            .map(|i| {
                if array.is_null(i) {
                    Value::Na
                } else {
                    Value::Str(
                        arrow::util::display::array_value_to_string(array, i).unwrap_or_default(),
                    )
                }
            })
            .collect(),
    }
}
133
/// Arrow `RecordBatch` input: each schema field and its array become one
/// column of the internal format.
///
/// Arrow-native producers such as DuckDB can emit query results directly as
/// `RecordBatch`es, so this impl lets them be plotted without going through
/// polars first.
#[cfg(feature = "arrow")]
impl GGData for arrow::record_batch::RecordBatch {
    fn into_dataframe(self) -> DataFrame {
        let schema = self.schema();
        let mut out = DataFrame::new();
        // Fields and columns are stored in the same order, so pair them up.
        for (field, array) in schema.fields().iter().zip(self.columns()) {
            let cells = arrow_array_to_values(array.as_ref());
            out.add_column(field.name().to_string(), cells);
        }
        out
    }
}
150
151/// Row-oriented input: Vec of HashMaps.
152impl GGData for Vec<HashMap<String, Value>> {
153    fn into_dataframe(self) -> DataFrame {
154        if self.is_empty() {
155            return DataFrame::new();
156        }
157
158        // Collect all column names
159        let mut col_names: Vec<String> = Vec::new();
160        for row in &self {
161            for key in row.keys() {
162                if !col_names.contains(key) {
163                    col_names.push(key.clone());
164                }
165            }
166        }
167
168        let mut df = DataFrame::new();
169        for name in &col_names {
170            let values: Vec<Value> = self
171                .iter()
172                .map(|row| row.get(name).cloned().unwrap_or(Value::Na))
173                .collect();
174            df.add_column(name.clone(), values);
175        }
176        df
177    }
178}
179
180/// Column-oriented input: Vec of (name, values) pairs.
181impl GGData for Vec<(String, Vec<Value>)> {
182    fn into_dataframe(self) -> DataFrame {
183        let mut df = DataFrame::new();
184        for (name, values) in self {
185            df.add_column(name, values);
186        }
187        df
188    }
189}
190
191/// Identity: DataFrame passes through.
192impl GGData for DataFrame {
193    fn into_dataframe(self) -> DataFrame {
194        self
195    }
196}
197
#[cfg(test)]
mod tests {
    use super::*;

    /// Rows with a uniform key set: every key becomes a column and each cell
    /// keeps its row position.
    #[test]
    fn test_from_hashmap_vec() {
        let data = vec![
            HashMap::from([
                ("x".to_string(), Value::Float(1.0)),
                ("y".to_string(), Value::Float(2.0)),
            ]),
            HashMap::from([
                ("x".to_string(), Value::Float(3.0)),
                ("y".to_string(), Value::Float(4.0)),
            ]),
        ];

        let df = data.into_dataframe();
        assert_eq!(df.nrows(), 2);
        assert_eq!(df.ncols(), 2);
        assert!(df.has_column("x"));
        assert!(df.has_column("y"));
        assert_eq!(df.column("x").unwrap()[0], Value::Float(1.0));
        assert_eq!(df.column("x").unwrap()[1], Value::Float(3.0));
        assert_eq!(df.column("y").unwrap()[0], Value::Float(2.0));
        assert_eq!(df.column("y").unwrap()[1], Value::Float(4.0));
    }

    /// Rows with different key sets: the columns are the union of all keys
    /// and a missing cell is filled with `Value::Na`.
    #[test]
    fn test_from_hashmap_vec_ragged_rows() {
        let data = vec![
            HashMap::from([("x".to_string(), Value::Float(1.0))]),
            HashMap::from([("y".to_string(), Value::Float(2.0))]),
        ];

        let df = data.into_dataframe();
        assert_eq!(df.nrows(), 2);
        assert_eq!(df.ncols(), 2);
        assert_eq!(df.column("x").unwrap()[0], Value::Float(1.0));
        assert_eq!(df.column("x").unwrap()[1], Value::Na);
        assert_eq!(df.column("y").unwrap()[0], Value::Na);
        assert_eq!(df.column("y").unwrap()[1], Value::Float(2.0));
    }

    /// No rows at all: the result has no columns.
    #[test]
    fn test_from_empty_hashmap_vec() {
        let data: Vec<HashMap<String, Value>> = Vec::new();

        let df = data.into_dataframe();
        assert_eq!(df.ncols(), 0);
    }

    /// Arrow input: column names come from the schema and nulls become
    /// `Value::Na`.
    #[cfg(feature = "arrow")]
    #[test]
    fn test_from_arrow_record_batch() {
        use arrow::array::{Float64Array, Int64Array, StringArray};
        use arrow::record_batch::RecordBatch;
        use std::sync::Arc;

        let batch = RecordBatch::try_from_iter(vec![
            (
                "x",
                Arc::new(Float64Array::from(vec![Some(1.0), None, Some(3.0)])) as _,
            ),
            ("n", Arc::new(Int64Array::from(vec![10, 20, 30])) as _),
            ("g", Arc::new(StringArray::from(vec!["a", "b", "c"])) as _),
        ])
        .unwrap();

        let df = batch.into_dataframe();
        assert_eq!(df.nrows(), 3);
        assert_eq!(df.ncols(), 3);
        assert!(df.has_column("x"));
        assert!(df.has_column("n"));
        assert!(df.has_column("g"));
        // Null in the float column becomes Value::Na.
        assert_eq!(df.column("x").unwrap()[1], Value::Na);
        assert_eq!(df.column("n").unwrap()[0], Value::Integer(10));
        assert_eq!(df.column("g").unwrap()[2], Value::Str("c".to_string()));
    }

    /// Column-oriented input: each `(name, values)` pair becomes one column
    /// with its values untouched.
    #[test]
    fn test_from_column_oriented() {
        let data = vec![
            ("x".to_string(), vec![Value::Float(1.0), Value::Float(2.0)]),
            ("y".to_string(), vec![Value::Float(3.0), Value::Float(4.0)]),
        ];

        let df = data.into_dataframe();
        assert_eq!(df.nrows(), 2);
        assert_eq!(df.ncols(), 2);
        assert!(df.has_column("x"));
        assert!(df.has_column("y"));
        assert_eq!(df.column("x").unwrap()[1], Value::Float(2.0));
        assert_eq!(df.column("y").unwrap()[0], Value::Float(3.0));
    }
}