Skip to main content

orc_rust/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::{error, proto};
19
20/// Contains statistics for a specific column, for the entire file
21/// or for a specific stripe.
22#[derive(Debug, Clone)]
23pub struct ColumnStatistics {
24    number_of_values: u64,
25    /// Use aid in 'IS NULL' predicates
26    has_null: bool,
27    type_statistics: Option<TypeStatistics>,
28}
29
30impl ColumnStatistics {
31    pub fn number_of_values(&self) -> u64 {
32        self.number_of_values
33    }
34
35    pub fn has_null(&self) -> bool {
36        self.has_null
37    }
38
39    pub fn type_statistics(&self) -> Option<&TypeStatistics> {
40        self.type_statistics.as_ref()
41    }
42}
43
/// Type-specific statistics attached to a [`ColumnStatistics`].
///
/// Each variant carries the summary values recorded for one family of
/// column types.
#[derive(Debug, Clone)]
pub enum TypeStatistics {
    /// For TinyInt, SmallInt, Int and BigInt
    Integer {
        /// Smallest value.
        min: i64,
        /// Largest value.
        max: i64,
        /// Sum of the values; `None` if the sum overflowed.
        sum: Option<i64>,
    },
    /// For Float and Double
    Double {
        /// Smallest value.
        min: f64,
        /// Largest value.
        max: f64,
        /// Sum of the values; `None` if the sum overflowed.
        sum: Option<f64>,
    },
    /// For string columns.
    String {
        /// Exact minimum, or only a lower bound (see `is_exact_min`).
        lower_bound: String,
        /// Exact maximum, or only an upper bound (see `is_exact_max`).
        upper_bound: String,
        /// Total length of all strings
        sum: i64,
        /// If true, `lower_bound` is an exact minimum. If false, it is
        /// only a lower bound.
        is_exact_min: bool,
        /// If true, `upper_bound` is an exact maximum. If false, it is
        /// only an upper bound.
        is_exact_max: bool,
    },
    /// For Boolean
    Bucket {
        /// Count taken from the first bucket entry of the statistics.
        true_count: u64,
    },
    /// For decimal columns; values are kept in string form.
    Decimal {
        // TODO: use our own decimal type?
        /// Smallest value.
        min: String,
        /// Largest value.
        max: String,
        /// Sum of the values.
        sum: String,
    },
    /// For date columns.
    Date {
        /// Smallest value, in days since epoch
        min: i32,
        /// Largest value, in days since epoch
        max: i32,
    },
    /// For binary columns.
    Binary {
        /// Total number of bytes across all values
        sum: i64,
    },
    /// For timestamp columns.
    Timestamp {
        /// Milliseconds since epoch.
        /// These were used before ORC-135,
        /// where local timezone offset was included.
        min: i64,
        /// Counterpart of `min` for the largest value.
        max: i64,
        /// Milliseconds since UNIX epoch
        min_utc: i64,
        /// Counterpart of `min_utc` for the largest value.
        max_utc: i64,
    },
    /// For columns whose values contain child elements.
    Collection {
        /// Smallest number of children in a single value.
        min_children: u64,
        /// Largest number of children in a single value.
        max_children: u64,
        /// Total number of children across all values.
        total_children: u64,
    },
}
103
104impl TryFrom<&proto::ColumnStatistics> for ColumnStatistics {
105    type Error = error::OrcError;
106
107    fn try_from(value: &proto::ColumnStatistics) -> Result<Self, Self::Error> {
108        let type_statistics = if value.number_of_values() == 0 {
109            None
110        } else if let Some(stats) = &value.int_statistics {
111            Some(TypeStatistics::Integer {
112                min: stats.minimum(),
113                max: stats.maximum(),
114                sum: stats.sum,
115            })
116        } else if let Some(stats) = &value.double_statistics {
117            Some(TypeStatistics::Double {
118                min: stats.minimum(),
119                max: stats.maximum(),
120                sum: stats.sum,
121            })
122        } else if let Some(stats) = &value.string_statistics {
123            let (lower_bound, is_exact_min) = stats
124                .minimum
125                .as_deref()
126                .map(|s| (s, true))
127                .unwrap_or_else(|| (stats.lower_bound(), false));
128            let (upper_bound, is_exact_max) = stats
129                .maximum
130                .as_deref()
131                .map(|s| (s, true))
132                .unwrap_or_else(|| (stats.upper_bound(), false));
133            Some(TypeStatistics::String {
134                lower_bound: lower_bound.to_owned(),
135                upper_bound: upper_bound.to_owned(),
136                sum: stats.sum(),
137                is_exact_min,
138                is_exact_max,
139            })
140        } else if let Some(stats) = &value.bucket_statistics {
141            // TODO: false count?
142            Some(TypeStatistics::Bucket {
143                true_count: stats.count[0], // TODO: safety check this
144            })
145        } else if let Some(stats) = &value.decimal_statistics {
146            Some(TypeStatistics::Decimal {
147                min: stats.minimum().to_owned(),
148                max: stats.maximum().to_owned(),
149                sum: stats.sum().to_owned(),
150            })
151        } else if let Some(stats) = &value.date_statistics {
152            Some(TypeStatistics::Date {
153                min: stats.minimum(),
154                max: stats.maximum(),
155            })
156        } else if let Some(stats) = &value.binary_statistics {
157            Some(TypeStatistics::Binary { sum: stats.sum() })
158        } else if let Some(stats) = &value.timestamp_statistics {
159            Some(TypeStatistics::Timestamp {
160                min: stats.minimum(),
161                max: stats.maximum(),
162                min_utc: stats.minimum_utc(),
163                max_utc: stats.maximum_utc(),
164            })
165        } else {
166            value
167                .collection_statistics
168                .as_ref()
169                .map(|stats| TypeStatistics::Collection {
170                    min_children: stats.min_children(),
171                    max_children: stats.max_children(),
172                    total_children: stats.total_children(),
173                })
174        };
175        Ok(Self {
176            number_of_values: value.number_of_values(),
177            has_null: value.has_null(),
178            type_statistics,
179        })
180    }
181}