orc_rust/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::{error, proto};
19
/// Contains statistics for a specific column, for the entire file
/// or for a specific stripe.
///
/// Fields are private; read them through the accessor methods of the same name.
#[derive(Debug, Clone)]
pub struct ColumnStatistics {
    /// Value count as recorded in the source statistics.
    // NOTE(review): presumably excludes nulls, as in the ORC format — confirm.
    number_of_values: u64,
    /// Used to aid in 'IS NULL' predicates
    has_null: bool,
    /// Type-specific statistics, or `None` when none were recorded.
    type_statistics: Option<TypeStatistics>,
}
29
30impl ColumnStatistics {
31    pub fn number_of_values(&self) -> u64 {
32        self.number_of_values
33    }
34
35    pub fn has_null(&self) -> bool {
36        self.has_null
37    }
38
39    pub fn type_statistics(&self) -> Option<&TypeStatistics> {
40        self.type_statistics.as_ref()
41    }
42}
43
/// Type-specific statistics attached to a [`ColumnStatistics`].
///
/// Each variant carries the values recorded for one family of column types.
#[derive(Debug, Clone)]
pub enum TypeStatistics {
    /// For TinyInt, SmallInt, Int and BigInt
    Integer {
        min: i64,
        max: i64,
        /// If sum overflows then recorded as None
        sum: Option<i64>,
    },
    /// For Float and Double
    Double {
        min: f64,
        max: f64,
        /// If sum overflows then recorded as None
        sum: Option<f64>,
    },
    /// For string values
    String {
        min: String,
        max: String,
        /// Total length of all strings
        sum: i64,
    },
    /// For Boolean
    Bucket {
        // NOTE(review): presumably the number of `true` values — confirm against the ORC spec.
        true_count: u64,
    },
    /// For decimal values; all fields are kept in string form
    Decimal {
        // TODO: use our own decimal type?
        min: String,
        max: String,
        sum: String,
    },
    /// For date values
    Date {
        /// Days since epoch
        min: i32,
        /// Days since epoch
        max: i32,
    },
    /// For binary values
    Binary {
        /// Total number of bytes across all values
        sum: i64,
    },
    /// For timestamp values
    Timestamp {
        /// Milliseconds since epoch.
        /// `min`/`max` were used before ORC-135,
        /// where the local timezone offset was included.
        min: i64,
        /// Milliseconds since epoch; see `min` for the pre-ORC-135 caveat.
        max: i64,
        /// Milliseconds since UNIX epoch
        min_utc: i64,
        /// Milliseconds since UNIX epoch
        max_utc: i64,
    },
    /// Child-count statistics
    // NOTE(review): presumably for list/map columns, counting children per value — confirm.
    Collection {
        min_children: u64,
        max_children: u64,
        total_children: u64,
    },
}
99
100impl TryFrom<&proto::ColumnStatistics> for ColumnStatistics {
101    type Error = error::OrcError;
102
103    fn try_from(value: &proto::ColumnStatistics) -> Result<Self, Self::Error> {
104        let type_statistics = if let Some(stats) = &value.int_statistics {
105            Some(TypeStatistics::Integer {
106                min: stats.minimum(),
107                max: stats.maximum(),
108                sum: stats.sum,
109            })
110        } else if let Some(stats) = &value.double_statistics {
111            Some(TypeStatistics::Double {
112                min: stats.minimum(),
113                max: stats.maximum(),
114                sum: stats.sum,
115            })
116        } else if let Some(stats) = &value.string_statistics {
117            Some(TypeStatistics::String {
118                min: stats.minimum().to_owned(),
119                max: stats.maximum().to_owned(),
120                sum: stats.sum(),
121            })
122        } else if let Some(stats) = &value.bucket_statistics {
123            // TODO: false count?
124            Some(TypeStatistics::Bucket {
125                true_count: stats.count[0], // TODO: safety check this
126            })
127        } else if let Some(stats) = &value.decimal_statistics {
128            Some(TypeStatistics::Decimal {
129                min: stats.minimum().to_owned(),
130                max: stats.maximum().to_owned(),
131                sum: stats.sum().to_owned(),
132            })
133        } else if let Some(stats) = &value.date_statistics {
134            Some(TypeStatistics::Date {
135                min: stats.minimum(),
136                max: stats.maximum(),
137            })
138        } else if let Some(stats) = &value.binary_statistics {
139            Some(TypeStatistics::Binary { sum: stats.sum() })
140        } else if let Some(stats) = &value.timestamp_statistics {
141            Some(TypeStatistics::Timestamp {
142                min: stats.minimum(),
143                max: stats.maximum(),
144                min_utc: stats.minimum_utc(),
145                max_utc: stats.maximum_utc(),
146            })
147        } else {
148            value
149                .collection_statistics
150                .as_ref()
151                .map(|stats| TypeStatistics::Collection {
152                    min_children: stats.min_children(),
153                    max_children: stats.max_children(),
154                    total_children: stats.total_children(),
155                })
156        };
157        Ok(Self {
158            number_of_values: value.number_of_values(),
159            has_null: value.has_null(),
160            type_statistics,
161        })
162    }
163}