Skip to main content

datafusion_ffi/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Helpers for moving [`Statistics`] across the FFI boundary as prost-encoded
19//! `datafusion_proto_common::Statistics` bytes.
20//!
21//! [`Statistics`] contains [`Precision<ScalarValue>`] for column min/max/sum,
22//! and `ScalarValue` is a large enum that's impractical to mirror in
23//! `#[repr(C)]`. The proto round-trip already exists in `datafusion-proto-common`
24//! and is the same pattern used to ship filter expressions across the FFI
25//! boundary, so we reuse it here.
26//!
27//! [`Precision<ScalarValue>`]: datafusion_common::stats::Precision
28
29use datafusion_common::{DataFusionError, Result, Statistics};
30use prost::Message;
31
32/// Serialize [`Statistics`] to prost-encoded
33/// `datafusion_proto_common::Statistics` bytes.
34pub(crate) fn serialize_statistics(stats: &Statistics) -> Vec<u8> {
35    datafusion_proto_common::Statistics::from(stats).encode_to_vec()
36}
37
38/// Decode prost-encoded `datafusion_proto_common::Statistics` bytes back into
39/// [`Statistics`].
40pub(crate) fn deserialize_statistics(bytes: &[u8]) -> Result<Statistics> {
41    let proto = datafusion_proto_common::Statistics::decode(bytes).map_err(|e| {
42        DataFusionError::Plan(format!("failed to decode Statistics: {e}"))
43    })?;
44    Statistics::try_from(&proto)
45}
46
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::datatypes::{DataType, Field, Schema};
    use datafusion_common::ScalarValue;
    use datafusion_common::stats::Precision;
    use datafusion_common::{ColumnStatistics, Statistics};

    use super::*;

    /// Serialize `original`, decode the resulting bytes, and assert that the
    /// decoded value equals the input.
    fn assert_round_trip(original: &Statistics) {
        let bytes = serialize_statistics(original);
        let observed = deserialize_statistics(&bytes).expect("decode");

        assert_eq!(&observed, original);
    }

    /// Statistics built by `Statistics::new_unknown` survive the round trip.
    #[test]
    fn round_trip_unknown_statistics() {
        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
        let original = Statistics::new_unknown(&Arc::new(schema));

        assert_round_trip(&original);
    }

    /// Exact counts plus integer and string scalar bounds survive the round
    /// trip, including a column that mixes `Exact`, `Inexact` and `Absent`.
    #[test]
    fn round_trip_exact_statistics_with_scalar_values() {
        let int_column = ColumnStatistics {
            null_count: Precision::Exact(2),
            max_value: Precision::Exact(ScalarValue::Int32(Some(50))),
            min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
            sum_value: Precision::Exact(ScalarValue::Int64(Some(1234))),
            distinct_count: Precision::Exact(40),
            byte_size: Precision::Exact(800),
        };
        let string_column = ColumnStatistics {
            null_count: Precision::Exact(0),
            max_value: Precision::Exact(ScalarValue::Utf8(Some("zebra".to_string()))),
            min_value: Precision::Exact(ScalarValue::Utf8(Some("ant".to_string()))),
            sum_value: Precision::Absent,
            distinct_count: Precision::Inexact(95),
            byte_size: Precision::Inexact(2048),
        };
        let original = Statistics {
            num_rows: Precision::Exact(100),
            total_byte_size: Precision::Exact(4096),
            column_statistics: vec![int_column, string_column],
        };

        assert_round_trip(&original);
    }

    /// `Inexact` and `Absent` precisions, with floating-point scalars, survive
    /// the round trip.
    #[test]
    fn round_trip_mixed_precision() {
        let original = Statistics {
            num_rows: Precision::Inexact(42),
            total_byte_size: Precision::Absent,
            column_statistics: vec![ColumnStatistics {
                null_count: Precision::Absent,
                max_value: Precision::Inexact(ScalarValue::Float64(Some(1.5))),
                min_value: Precision::Absent,
                sum_value: Precision::Inexact(ScalarValue::Float64(Some(63.0))),
                distinct_count: Precision::Absent,
                byte_size: Precision::Absent,
            }],
        };

        assert_round_trip(&original);
    }

    /// Malformed input must surface as an error rather than a panic or a
    /// silently defaulted value.
    #[test]
    fn deserialize_rejects_malformed_bytes() {
        // A lone byte with the continuation bit set is a truncated varint, so
        // the protobuf decoder cannot even read the first field key.
        let err = deserialize_statistics(&[0xFF])
            .expect_err("truncated varint must not decode");

        assert!(err.to_string().contains("failed to decode Statistics"));
    }
}